1 /*******************************************************************************
2
3 Template class for xml / html / xhtml / etc (markup language) entity
4 en/decoders, which share basically the same entity encoding scheme, only
5 differing in the exact entities which must be encoded. (The html entities
6 are a superset of the xml entities, for example.)
7
8 See_Also:
9 http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
10
11 Example usage:
12
13 ---
14
15 import ocean.text.entities.HtmlEntityCodec;
16
17 scope entity_codec = new HtmlEntityCodec;
18
19 char[] test = "hello & world © ß Ȱ'";
20
21 if ( entity_codec.containsUnencoded(test) )
22 {
23 char[] encoded;
24 entity_codec.encode(test, encoded);
25 }
26
27 ---
28
29 Copyright:
30 Copyright (c) 2009-2016 dunnhumby Germany GmbH.
31 All rights reserved.
32
33 License:
34 Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
35 Alternatively, this file may be distributed under the terms of the Tango
36 3-Clause BSD License (see LICENSE_BSD.txt for details).
37
38 *******************************************************************************/
39
40 module ocean.text.entities.model.MarkupEntityCodec;
41
42
43
44
45 import ocean.meta.types.Qualifiers;
46
47 import ocean.core.Array;
48
49 import ocean.text.entities.model.IEntityCodec;
50 import ocean.text.entities.model.IEntitySet;
51
52 import ocean.text.utf.UtfString;
53
54 import ocean.text.util.StringSearch;
55
56 import Utf = ocean.text.convert.Utf;
57
58 import Math = ocean.math.Math: min;
59
60 import Integer = ocean.text.convert.Integer_tango: toInt;
61
62 import ocean.core.Verify;
63
64
65 /*******************************************************************************
66
67 Class to en/decode xml / html style entities.
68
69 *******************************************************************************/
70
71 public class MarkupEntityCodec ( E : IEntitySet ) : IEntityCodec!(E)
72 {
73 /***************************************************************************
74
75 This alias.
76
77 ***************************************************************************/
78
// Shorthand for the concrete class type; used when building the
// compile-time error messages in the methods of this class.
public alias This = typeof(this);
80
81
82 /***************************************************************************
83
84 Buffers for each character type, used by the utf8 encoder in the methods
85 charTo() & dcharTo().
86
87 ***************************************************************************/
88
89 private char[] char_buffer;
90
91 private wchar[] wchar_buffer;
92
93 private dchar[] dchar_buffer;
94
95
96 /***************************************************************************
97
98 Buffer used when formatting an entity.
99
100 ***************************************************************************/
101
102 private char[] entity_buf;
103
104
105 /***************************************************************************
106
107 Encode any unencoded entities in the input string.
108
109 Params:
110 text = string to encode
111 encoded = output string
112
113 Returns:
114 encoded output string
115
116 ***************************************************************************/
117
public override char[] encode ( const(char)[] text, ref char[] encoded )
{
    // char overload: forwards to the shared template implementation.
    return this.encode_!(const(char), char)(text, encoded);
}

public override wchar[] encode ( const(wchar)[] text, ref wchar[] encoded )
{
    // wchar overload: forwards to the shared template implementation.
    return this.encode_!(const(wchar), wchar)(text, encoded);
}

public override dchar[] encode ( const(dchar)[] text, ref dchar[] encoded )
{
    // dchar overload: forwards to the shared template implementation.
    return this.encode_!(const(dchar), dchar)(text, encoded);
}
132
133
134 /***************************************************************************
135
136 Decode any encoded entities in the input string.
137
138 Params:
139 text = string to decode
140 decoded = output string
141
142 Returns:
143 decoded output string
144
145 ***************************************************************************/
146
public override mstring decode ( const(char)[] text, ref mstring decoded )
{
    // char overload: forwards to the shared template implementation.
    return this.decode_!(const(char), char)(text, decoded);
}

public override wchar[] decode ( const(wchar)[] text, ref wchar[] decoded )
{
    // wchar overload: forwards to the shared template implementation.
    return this.decode_!(const(wchar), wchar)(text, decoded);
}

public override dchar[] decode ( const(dchar)[] text, ref dchar[] decoded )
{
    // dchar overload: forwards to the shared template implementation.
    return this.decode_!(const(dchar), dchar)(text, decoded);
}
161
162
163 /***************************************************************************
164
165 Checks whether the input string contains any unencoded entities.
166
167 Params:
168 text = string to check
169
170 Returns:
171 true if one or more unencoded entities are found
172
173 ***************************************************************************/
174
public override bool containsUnencoded ( const(char)[] text )
{
    // char overload: forwards to the shared template implementation.
    return this.containsUnencoded_!(const(char))(text);
}

public override bool containsUnencoded ( const(wchar)[] text )
{
    // wchar overload: forwards to the shared template implementation.
    return this.containsUnencoded_!(const(wchar))(text);
}

public override bool containsUnencoded ( const(dchar)[] text )
{
    // dchar overload: forwards to the shared template implementation.
    return this.containsUnencoded_!(const(dchar))(text);
}
189
190
191 /***************************************************************************
192
193 Checks whether the input string contains any encoded entities.
194
195 Params:
196 text = string to check
197
198 Returns:
199 true if one or more encoded entities are found
200
201 ***************************************************************************/
202
public override bool containsEncoded ( const(char)[] text )
{
    // char overload: forwards to the shared template implementation.
    return this.containsEncoded_!(const(char))(text);
}

public override bool containsEncoded ( const(wchar)[] text )
{
    // wchar overload: forwards to the shared template implementation.
    return this.containsEncoded_!(const(wchar))(text);
}

public override bool containsEncoded ( const(dchar)[] text )
{
    // dchar overload: forwards to the shared template implementation.
    return this.containsEncoded_!(const(dchar))(text);
}
217
218
219 /***************************************************************************
220
221 Checks whether the input string begins with an unencoded entity.
222
        Note: a full string has to be passed (not just a single character), as
        '&' is an unencoded entity, but "&amp;" is not - these cases are not
        distinguishable from just the 1st character.
226
227 Params:
228 text = string to check
229
230 Returns:
231 true if the first character in the input string is an unencoded
232 entity
233
234 ***************************************************************************/
235
public bool isUnencodedEntity ( Char ) ( Char[] text )
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof);

    // Only the first character of the string decides the outcome.
    auto first = UtfString!(Char, true).extract(text);

    // A character which is not in the entity set is never an unencoded
    // entity.
    if ( !(first in this.entities) )
    {
        return false;
    }

    // Every entity character other than '&' counts as unencoded.
    if ( first != '&' )
    {
        return true;
    }

    // An '&' is unencoded unless it starts an entity that decodes to a
    // character.
    auto candidate = this.sliceEncodedEntity(text);
    if ( !candidate.length )
    {
        return true;
    }

    return this.decodeEntity(candidate) == InvalidUnicode;
}
272
273
274 /***************************************************************************
275
276 Checks whether the input string begins with an encoded entity.
277
278 Params:
279 text = string to check
280 exact_match = if true, the encoded entity must fill the entire input
281 string
282
283 Returns:
284 true if the string begins with an encoded entity
285
286 ***************************************************************************/
287
public bool isEncodedEntity ( Char ) ( Char[] text, bool exact_match = false )
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    auto leading_entity = this.sliceEncodedEntity(text);

    // The text must start with an entity; with exact_match set the entity
    // must in addition span the whole text.
    return leading_entity.length > 0
        && (!exact_match || leading_entity.length == text.length);
}
305
306
307 /***************************************************************************
308
309 Converts an encoded entity to a unicode character. The entity may be
310 either:
            - a numeric character reference (eg "&#xE1;" for 'á'), or
            - a named ISO8859-1/15 (Latin 1/9) entity (eg "&szlig;" for 'ß').
313
314 Params:
            entity = entity content to convert; leading '&' and terminating ';'
                are expected
317
318 Returns:
319 the unicode character or InvalidUnicode on failure
320
321 ***************************************************************************/
322
public dchar decodeEntity ( Char ) ( Char[] entity )
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    // The argument has to be exactly one encoded entity, nothing more.
    verify(this.isEncodedEntity(entity, true),
        This.stringof ~ ".decodeEntity - invalid character entity");

    if ( !entity.length )
    {
        return InvalidUnicode;
    }

    // The character following the '&' selects the decoder: '#' marks a
    // numeric character reference, anything else a named entity.
    UtfString!(Char, true) characters = { entity };

    return (characters[1] == '#')
        ? this.decodeNumericCharacterRef(entity)
        : this.decodeCharacterEntity(entity);
}
353
354
355 /***************************************************************************
356
357 Internal method for encoding any unencoded entities in a string.
358
359 Params:
360 text = string to encode
361 encoded = encoded output string
362
363 Returns:
364 encoded output string
365
366 ***************************************************************************/
367
protected MutChar[] encode_ ( ConstChar, MutChar ) ( ConstChar[] text,
    ref MutChar[] encoded )
{
    static assert (is(Unqual!(ConstChar) == Unqual!(MutChar)));

    static assert(
        is(MutChar == char) || is(MutChar == wchar) || is(MutChar == dchar),
        This.stringof ~ " template parameter MutChar must be one of {char, wchar, dchar}, not " ~ MutChar.stringof
    );

    encoded.length = 0;

    // Start of the stretch of input which has not been copied to the
    // output yet.
    size_t pending_from;

    for ( size_t pos; pos < text.length; )
    {
        ConstChar[] remaining = text[pos .. $];

        // Extract the character at pos and the number of array elements
        // it occupies.
        size_t units;
        auto character = UtfString!(ConstChar, true).extract(remaining, units);

        auto next_pos = pos + units;

        if ( this.isUnencodedEntity(remaining) )
        {
            // Flush the plain text gathered so far, then emit the encoded
            // form of the character in place of the character itself.
            encoded.append(text[pending_from .. pos]);
            this.appendEncodedEntity(encoded, character);

            pending_from = next_pos;
        }

        pos = next_pos;
    }

    // Copy whatever follows the last encoded character.
    encoded.append(text[pending_from .. $]);

    return encoded;
}
406
407
408 /***************************************************************************
409
410 Internal method for decoding any encoded entities in a string.
411
412 Params:
413 text = string to decode
414 decoded = decoded output string
415
416 Returns:
417 decoded output string
418
419 ***************************************************************************/
420
protected MutChar[] decode_ ( ConstChar, MutChar ) ( ConstChar[] text,
    ref MutChar[] decoded )
{
    static assert (is(Unqual!(ConstChar) == Unqual!(MutChar)));

    static assert(
        is(MutChar == char) || is(MutChar == wchar) || is(MutChar == dchar),
        This.stringof ~ " template parameter MutChar must be one of {char, wchar, dchar}, not " ~ MutChar.stringof
    );

    decoded.length = 0;

    // Start of the stretch of input which has not been copied to the
    // output yet.
    size_t pending_from = 0;
    size_t pos = 0;

    while ( pos < text.length )
    {
        // Only an '&' can start an encoded entity.
        if ( text[pos] != '&' )
        {
            ++pos;
            continue;
        }

        auto entity = this.sliceEncodedEntity(text[pos .. $]);
        if ( !entity.length )
        {
            ++pos;
            continue;
        }

        // Flush the plain text which precedes the entity.
        decoded.append(text[pending_from .. pos]);

        // An entity which cannot be decoded appends nothing, but is still
        // skipped below, so it does not appear in the output at all.
        auto unicode = this.decodeEntity(entity);
        if ( unicode != InvalidUnicode )
        {
            decoded.append(this.dcharTo!(MutChar)(unicode));
        }

        pos += entity.length;
        pending_from = pos;
    }

    // Copy whatever follows the last entity.
    decoded.append(text[pending_from .. $]);

    return decoded;
}
463
464
465 /***************************************************************************
466
467 Internal method for checking whether the passed string contains any
468 unencoded entities.
469
470 Params:
471 text = string to check
472
473 Returns:
474 true if any unencoded entities are found
475
476 ***************************************************************************/
477
protected bool containsUnencoded_ ( Char ) ( Char[] text )
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    // Test the remainder of the text at every position the iteration
    // yields; only the position is needed, the iterated value is unused.
    UtfString!(Char) characters = { text };
    foreach ( offset, unused; characters )
    {
        auto tail = text[offset .. $];
        if ( this.isUnencodedEntity(tail) )
        {
            return true;
        }
    }

    return false;
}
498
499
500 /***************************************************************************
501
502 Internal method for checking whether the passed string contains any
503 encoded entities.
504
505 Params:
506 text = string to check
507
508 Returns:
509 true if any encoded entities are found
510
511 ***************************************************************************/
512
protected bool containsEncoded_ ( Char ) ( Char[] text )
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    // Try to slice an encoded entity off the remainder of the text at
    // every position the iteration yields; the iterated value is unused.
    UtfString!(Char) characters = { text };
    foreach ( offset, unused; characters )
    {
        if ( this.sliceEncodedEntity(text[offset .. $]).length )
        {
            return true;
        }
    }

    return false;
}
534
535
536 /***************************************************************************
537
538 Appends an encoded entity to a string (in the form "&entity_name;").
539
540 Params:
541 text = string to append to
542 c = unicode character for entity to append
543
544 Returns:
545 appended string
546
547 ***************************************************************************/
548
protected Char[] appendEncodedEntity ( Char ) ( ref Char[] text, dchar c )
{
    static assert(
        is(Char == char) || is(Char == wchar) || is(Char == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof);

    // Nothing is appended for a character the entity set has no name for.
    if ( this.entities.getName(c).length )
    {
        // The entity is formatted into the shared buffer, then converted
        // to the character type of the output string.
        auto formatted = this.entities.getEncodedEntity(c, this.entity_buf);
        text.append(this.charTo!(Char)(formatted));
    }

    return text;
}
562
563
564 /***************************************************************************
565
566 Parses content to see if it's an encoded entity string. The criteria
567 are:
568
            a) length of "text" is at least 3

            b) character 0 is '&'

            c) a ';' between characters 1 and 16

            d) no white space character or '&' before the first ';'

            e) first ';' is at index 2 or later

        If "text" complies with all of these, a slice from the '&' to the ';'
        is returned, otherwise null.
581
582 Params:
583 text = HTML entity string to parse
584
585 Returns:
            The entity if parsing was successful or null on failure.
587
588 ***************************************************************************/
589
protected Char[] sliceEncodedEntity ( Char ) ( Char[] text )
{
    static assert(
        is(Unqual!(Char) == char)
        || is(Unqual!(Char) == wchar)
        || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    // Highest index in text at which the terminating ';' is accepted.
    // Without this bound any run of non-space characters between an '&'
    // and a far away ';' would be treated as an entity.
    enum max_semicolon_index = 16;

    // The shortest possible entity has three elements: '&', one character
    // and ';'. null is returned (not "") so that this also compiles when
    // Char is a mutable character type.
    if ( text.length <= 2 )
    {
        return null;
    }

    Char[] entity; // stays null unless a complete entity is found

    UtfString!(Char, true) utf_str = { text };
    foreach ( i, c; utf_str )
    {
        // i is used as an index into text (see the slice below)
        if ( i == 0 )
        {
            // the text must start with '&'
            if ( c != '&' )
            {
                break;
            }

            continue;
        }

        // the ';' must appear within the first max_semicolon_index + 1
        // elements of the text
        if ( i > max_semicolon_index )
        {
            break;
        }

        // neither white space nor a second '&' may precede the ';'
        if ( c == '&' || this.isSpace(c) )
        {
            break;
        }

        if ( c == ';' )
        {
            // "&;" is not an entity: at least one character has to
            // separate the '&' from the ';'
            if ( i >= 2 )
            {
                entity = text[0 .. i + 1];
            }

            break;
        }
    }

    return entity;
}
637
638
639 /***************************************************************************
640
641 Checks whether the given character is a space.
642
643 Params:
644 c = character to check
645
646 Returns:
647 true if the character is a space
648
649 ***************************************************************************/
650
protected bool isSpace ( Char ) ( Char c )
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    StringSearch!(true) searcher;

    // The check is always performed on a dchar, whatever the character
    // type of the argument is.
    dchar code_point = c;

    return !!searcher.isSpace(code_point);
}
664
665
666 /***************************************************************************
667
668 Converts an encoded entity to a unicode character.
669
670 Params:
671 entity = entity content to convert; including leading '&' and
672 terminating ';'
673
674 Returns:
675 the unicode character or InvalidUnicode on failure
676
677 ***************************************************************************/
678
protected dchar decodeCharacterEntity ( Char ) ( Char[] entity )
in
{
    assert(entity.length >= 2, "character entity too short");
    assert(entity[0] == '&' && entity[$ - 1] == ';', "invalid character entity");
}
do
{
    static assert(
        is(Unqual!(Char) == char) || is(Unqual!(Char) == wchar) || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    // Strip the delimiting '&' and ';' and look the remaining name up in
    // the entity set.
    auto name = entity[1 .. $ - 1];

    return this.entities.getUnicode(name);
}
696
697
698 /***************************************************************************
699
700 Converts an encoded numeric character reference entity to a unicode
701 character. Numeric character references are either:
702
703 &#<decimal Unicode>;
704 or
705 &#x<hexadecimal Unicode>;
706
707 (case insensitive)
708
709 Examples:
710
            Entity      Character   Unicode hex (dec)
            "&#65;"     'A'         0x41 (65)
            "&#xE1;"    'á'         0xE1 (225)
            "&#Xf1;"    'ñ'         0xF1 (241)
715
716 Params:
717 entity = entity content to convert; including leading "&#" and
718 terminating ';'
719
720 Returns:
721 the unicode character or InvalidUnicode on failure
722
723 ***************************************************************************/
724
protected dchar decodeNumericCharacterRef ( Char ) ( Char[] entity )
in
{
    assert(entity.length >= 2, "character entity too short");
    assert(entity[0] == '&' && entity[$ - 1] == ';', "invalid character entity");
}
do
{
    static assert(
        is(Unqual!(Char) == char)
        || is(Unqual!(Char) == wchar)
        || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    // "&#N;" (four elements) is the shortest well-formed reference. The
    // check also keeps the indexing below inside the array, which the
    // contract above (length >= 2) does not guarantee on its own.
    if ( entity.length < 4 || entity[1] != '#' )
    {
        return InvalidUnicode;
    }

    // The character following "&#" selects the radix: 'x' / 'X' marks a
    // hexadecimal reference, anything else is parsed as decimal.
    bool hexadecimal = entity[2] == 'x' || entity[2] == 'X';

    auto digits = entity[(hexadecimal ? 3 : 2) .. $ - 1];

    // "&#x;" has no digits at all and is not a character reference.
    if ( !digits.length )
    {
        return InvalidUnicode;
    }

    int value;

    try
    {
        value = Integer.toInt(digits, hexadecimal ? 16 : 10);
    }
    catch ( Exception )
    {
        // a conversion failure means the reference is malformed
        return InvalidUnicode;
    }

    // Only Unicode scalar values are characters: reject negative numbers,
    // values beyond U+10FFFF and the surrogate range U+D800 .. U+DFFF
    // instead of casting them to a meaningless dchar.
    if ( value < 0 || value > 0x10FFFF
        || (value >= 0xD800 && value <= 0xDFFF) )
    {
        return InvalidUnicode;
    }

    return cast(dchar) value;
}
762
763
764 /***************************************************************************
765
766 Converts from a unicode dchar to an array of the specified character
767 type, doing utf8 encoding if applicable.
768
769 Params:
770 unicode = unicode character to convert
771
772 Returns:
773 converted character string
774
775 ***************************************************************************/
776
private Char[] dcharTo ( Char ) ( dchar unicode )
{
    // Wrap the character in a one element array and defer to the array
    // overload.
    dchar[1] single = unicode;

    return this.dcharTo!(Char)(single);
}
783
784
785 /***************************************************************************
786
787 Converts from a unicode dchar[] to an array of the specified character
788 type, doing utf8 encoding if applicable.
789
790 Params:
791 unicode = unicode string to convert
792
793 Returns:
794 converted character string
795
796 ***************************************************************************/
797
private Char[] dcharTo ( Char ) ( dchar[] unicode )
{
    static assert(
        is(Char == char) || is(Char == wchar) || is(Char == dchar),
        typeof(this).stringof ~ ".dcharTo - method template can only handle char types");

    // Each output character type is converted into the member buffer of
    // that type.
    static if ( is(Char == char) )
    {
        return super.dcharTo!(Char)(unicode, this.char_buffer);
    }
    else static if ( is(Char == wchar) )
    {
        return super.dcharTo!(Char)(unicode, this.wchar_buffer);
    }
    else static if ( is(Char == dchar) )
    {
        return super.dcharTo!(Char)(unicode, this.dchar_buffer);
    }
}
817
818
819 /***************************************************************************
820
821 Converts from a single char to an array of the specified character type.
822
823 Params:
824 text = character to convert
825
826 Returns:
827 converted character string
828
829 ***************************************************************************/
830
private Char[] charTo ( Char ) ( char text )
{
    // Wrap the character in a one element array and defer to the array
    // overload. The array has to be a char array: the array overload
    // accepts char[] only, so a dchar array cannot be passed to it.
    char[1] str;
    str[0] = text;

    return this.charTo!(Char)(str);
}
837
838
839 /***************************************************************************
840
841 Converts from a utf8 char array to an array of the specified character
842 type.
843
844 Params:
845 text = string to convert
846
847 Returns:
848 converted character string
849
850 ***************************************************************************/
851
private Char[] charTo ( Char ) ( char[] text )
{
    static assert(
        is(Char == char) || is(Char == wchar) || is(Char == dchar),
        typeof(this).stringof ~ ".charTo - method template can only handle char types");

    // Each output character type is converted into the member buffer of
    // that type.
    static if ( is(Char == char) )
    {
        return super.charTo!(Char)(text, this.char_buffer);
    }
    else static if ( is(Char == wchar) )
    {
        return super.charTo!(Char)(text, this.wchar_buffer);
    }
    else static if ( is(Char == dchar) )
    {
        return super.charTo!(Char)(text, this.dchar_buffer);
    }
}
871 }