ocean.text.entities.model.MarkupEntityCodec source code

1 /*******************************************************************************
2 
3     Template class for xml / html / xhtml / etc (markup language) entity
4     en/decoders, which share basically the same entity encoding scheme, only
5     differing in the exact entities which must be encoded. (The html entities
6     are a superset of the xml entities, for example.)
7 
8     See_Also:
9         http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
10 
11     Example usage:
12 
13     ---
14 
15         import ocean.text.entities.HtmlEntityCodec;
16 
17         scope entity_codec = new HtmlEntityCodec;
18 
19         const(char)[] test = "hello & world © &szlig;&nbsp;&amp;#x230;'";
20 
21         if ( entity_codec.containsUnencoded(test) )
22         {
23             char[] encoded;
24             entity_codec.encode(test, encoded);
25         }
26 
27     ---
28 
29     Copyright:
30         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
31         All rights reserved.
32 
33     License:
34         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
35         Alternatively, this file may be distributed under the terms of the Tango
36         3-Clause BSD License (see LICENSE_BSD.txt for details).
37 
38 *******************************************************************************/
39 
40 module ocean.text.entities.model.MarkupEntityCodec;
41 
42 
43 
44 
45 import ocean.meta.types.Qualifiers;
46 
47 import ocean.core.Array;
48 
49 import ocean.text.entities.model.IEntityCodec;
50 import ocean.text.entities.model.IEntitySet;
51 
52 import ocean.text.utf.UtfString;
53 
54 import ocean.text.util.StringSearch;
55 
56 import Utf = ocean.text.convert.Utf;
57 
58 import Math = ocean.math.Math: min;
59 
60 import Integer = ocean.text.convert.Integer_tango: toInt;
61 
62 import ocean.core.Verify;
63 
64 
65 /*******************************************************************************
66 
67     Class to en/decode xml / html style entities.
68 
69 *******************************************************************************/
70 
71 public class MarkupEntityCodec ( E : IEntitySet ) : IEntityCodec!(E)
72 {
73     /***************************************************************************
74 
75         This alias.
76 
77     ***************************************************************************/
78 
79     public alias typeof(this) This;
80 
81 
82     /***************************************************************************
83 
84         Buffers for each character type, used by the utf8 encoder in the methods
85         charTo() & dcharTo().
86 
87     ***************************************************************************/
88 
    // Scratch buffer passed to the base-class converters by charTo() /
    // dcharTo() when the requested output character type is char.
    private char[] char_buffer;

    // Scratch buffer passed to the base-class converters when the requested
    // output character type is wchar.
    private wchar[] wchar_buffer;

    // Scratch buffer passed to the base-class converters when the requested
    // output character type is dchar.
    private dchar[] dchar_buffer;


    /***************************************************************************

        Buffer used when formatting an entity.

    ***************************************************************************/

    // Handed to the entity set's getEncodedEntity() by appendEncodedEntity().
    private char[] entity_buf;
103 
104 
105     /***************************************************************************
106 
107         Encode any unencoded entities in the input string.
108 
109         Params:
110             text = string to encode
111             encoded = output string
112 
113         Returns:
114             encoded output string
115 
116     ***************************************************************************/
117 
118     public override char[] encode ( const(char)[] text, ref char[] encoded )
119     {
120         return this.encode_(text, encoded);
121     }
122 
123     public override wchar[] encode ( const(wchar)[] text, ref wchar[] encoded )
124     {
125         return this.encode_(text, encoded);
126     }
127 
128     public override dchar[] encode ( const(dchar)[] text, ref dchar[] encoded )
129     {
130         return this.encode_(text, encoded);
131     }
132 
133 
134     /***************************************************************************
135 
136         Decode any encoded entities in the input string.
137 
138         Params:
139             text = string to decode
140             decoded = output string
141 
142         Returns:
143             decoded output string
144 
145     ***************************************************************************/
146 
147     public override mstring decode ( const(char)[] text, ref mstring decoded )
148     {
149         return this.decode_(text, decoded);
150     }
151 
152     public override wchar[] decode ( const(wchar)[] text, ref wchar[] decoded )
153     {
154         return this.decode_(text, decoded);
155     }
156 
157     public override dchar[] decode ( const(dchar)[] text, ref dchar[] decoded )
158     {
159         return this.decode_(text, decoded);
160     }
161 
162 
163     /***************************************************************************
164 
165         Checks whether the input string contains any unencoded entities.
166 
167         Params:
168             text = string to check
169 
170         Returns:
171             true if one or more unencoded entities are found
172 
173     ***************************************************************************/
174 
175     public override bool containsUnencoded ( const(char)[] text )
176     {
177         return this.containsUnencoded_(text);
178     }
179 
180     public override bool containsUnencoded ( const(wchar)[] text )
181     {
182         return this.containsUnencoded_(text);
183     }
184 
185     public override bool containsUnencoded ( const(dchar)[] text )
186     {
187         return this.containsUnencoded_(text);
188     }
189 
190 
191     /***************************************************************************
192 
193         Checks whether the input string contains any encoded entities.
194 
195         Params:
196             text = string to check
197 
198         Returns:
199             true if one or more encoded entities are found
200 
201     ***************************************************************************/
202 
203     public override bool containsEncoded ( const(char)[] text )
204     {
205         return this.containsEncoded_(text);
206     }
207 
208     public override bool containsEncoded ( const(wchar)[] text )
209     {
210         return this.containsEncoded_(text);
211     }
212 
213     public override bool containsEncoded ( const(dchar)[] text )
214     {
215         return this.containsEncoded_(text);
216     }
217 
218 
219     /***************************************************************************
220 
221         Checks whether the input string begins with an unencoded entity.
222 
223         Note: a full string has to be passed (not just a single character), as
224         '&' is an unencoded entity, but "&amp;" is not - these cases are not
225         distinguishable from just the 1st character.
226 
227         Params:
228             text = string to check
229 
230         Returns:
231             true if the first character in the input string is an unencoded
232             entity
233 
234     ***************************************************************************/
235 
236     public bool isUnencodedEntity ( Char ) ( Char[] text )
237     {
238         static assert(
239             is(Unqual!(Char) == char)
240                 || is(Unqual!(Char) == wchar)
241                 || is(Unqual!(Char) == dchar),
242             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof);
243 
244         auto c = UtfString!(Char, true).extract(text);
245 
246         if ( c in this.entities )
247         {
248             if ( c == '&' )
249             {
250                 // The following characters must form a valid character code
251                 auto entity = this.sliceEncodedEntity(text);
252                 if ( entity.length )
253                 {
254                     auto decoded_entity = this.decodeEntity(entity);
255                     return decoded_entity == InvalidUnicode;
256                 }
257                 else
258                 {
259                     return true;
260                 }
261             }
262             else
263             {
264                 return true;
265             }
266         }
267         else
268         {
269             return false;
270         }
271     }
272 
273 
274     /***************************************************************************
275 
276         Checks whether the input string begins with an encoded entity.
277 
278         Params:
279             text = string to check
280             exact_match = if true, the encoded entity must fill the entire input
281                 string
282 
283         Returns:
284             true if the string begins with an encoded entity
285 
286     ***************************************************************************/
287 
288     public bool isEncodedEntity ( Char ) ( Char[] text, bool exact_match = false )
289     {
290         static assert(
291             is(Unqual!(Char) == char)
292                 || is(Unqual!(Char) == wchar)
293                 || is(Unqual!(Char) == dchar),
294             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
295         );
296 
297         auto entity = this.sliceEncodedEntity(text);
298         if ( !entity.length )
299         {
300             return false;
301         }
302 
303         return exact_match ? entity.length == text.length : true;
304     }
305 
306 
307     /***************************************************************************
308 
309         Converts an encoded entity to a unicode character. The entity may be
310         either:
311             - a numeric character reference (eg "&#xE1;" for 'á'), or
312             - a named ISO8859-1/15 (Latin 1/9) entity (eg "&szlig;" for 'ß').
313 
314         Params:
315             entity = entity content to convert; leading '&' and terminating ';'
316                 are expected
317 
318         Returns:
319             the unicode character or InvalidUnicode on failure
320 
321     ***************************************************************************/
322 
323     public dchar decodeEntity ( Char ) ( Char[] entity )
324     {
325         static assert(
326             is(Unqual!(Char) == char)
327                 || is(Unqual!(Char) == wchar)
328                 || is(Unqual!(Char) == dchar),
329             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
330         );
331 
332         verify(this.isEncodedEntity(entity, true),
333             This.stringof ~ ".decodeEntity - invalid character entity");
334 
335         dchar unicode = InvalidUnicode;
336 
337         if ( entity.length )
338         {
339             UtfString!(Char, true) utf_str = { entity };
340             auto c = utf_str[1];
341             if (c == '#')
342             {
343                 unicode = this.decodeNumericCharacterRef(entity);
344             }
345             else
346             {
347                 unicode = this.decodeCharacterEntity(entity);
348             }
349         }
350 
351         return unicode;
352     }
353 
354 
355     /***************************************************************************
356 
357         Internal method for encoding any unencoded entities in a string.
358 
359         Params:
360             text = string to encode
361             encoded = encoded output string
362 
363         Returns:
364             encoded output string
365 
366     ***************************************************************************/
367 
368     protected MutChar[] encode_ ( ConstChar, MutChar ) ( ConstChar[] text,
369         ref MutChar[] encoded )
370     {
371         static assert (is(Unqual!(ConstChar) == Unqual!(MutChar)));
372 
373         static assert(
374             is(MutChar == char)
375                 || is(MutChar == wchar)
376                 || is(MutChar == dchar),
377             This.stringof ~ " template parameter MutChar must be one of {char, wchar, dchar}, not " ~ MutChar.stringof
378         );
379 
380         encoded.length = 0;
381 
382         size_t last_special_char;
383         size_t i;
384         while ( i < text.length )
385         {
386             ConstChar[] process = text[i..$];
387 
388             size_t width;
389             auto c = UtfString!(ConstChar, true).extract(process, width);
390 
391             if ( this.isUnencodedEntity(process) )
392             {
393                 encoded.append(text[last_special_char..i]);
394 
395                 this.appendEncodedEntity(encoded, c);
396 
397                 last_special_char = i + width;
398             }
399 
400             i += width;
401         }
402 
403         encoded.append(text[last_special_char..$]);
404         return encoded;
405     }
406 
407 
408     /***************************************************************************
409 
410         Internal method for decoding any encoded entities in a string.
411 
412         Params:
413             text = string to decode
414             decoded = decoded output string
415 
416         Returns:
417             decoded output string
418 
419     ***************************************************************************/
420 
421     protected MutChar[] decode_ ( ConstChar, MutChar ) ( ConstChar[] text,
422         ref MutChar[] decoded )
423     {
424         static assert (is(Unqual!(ConstChar) == Unqual!(MutChar)));
425 
426         static assert(
427             is(MutChar == char)
428                 || is(MutChar == wchar)
429                 || is(MutChar == dchar),
430             This.stringof ~ " template parameter MutChar must be one of {char, wchar, dchar}, not " ~ MutChar.stringof
431         );
432 
433         decoded.length = 0;
434 
435         size_t last_special_char = 0;
436         size_t i = 0;
437         while ( i < text.length )
438         {
439             if ( text[i] == '&')
440             {
441                 auto entity = this.sliceEncodedEntity(text[i..$]);
442                 if ( entity.length )
443                 {
444                     decoded.append(text[last_special_char..i]);
445 
446                     dchar unicode = this.decodeEntity(entity);
447                     if ( unicode != InvalidUnicode )
448                     {
449                         decoded.append(this.dcharTo!(MutChar)(unicode));
450                     }
451 
452                     i += entity.length;
453                     last_special_char = i;
454                     continue;
455                 }
456             }
457             ++i;
458         }
459 
460         decoded.append(text[last_special_char..$]);
461         return decoded;
462     }
463 
464 
465     /***************************************************************************
466 
467         Internal method for checking whether the passed string contains any
468         unencoded entities.
469 
470         Params:
471             text = string to check
472 
473         Returns:
474             true if any unencoded entities are found
475 
476     ***************************************************************************/
477 
478     protected bool containsUnencoded_ ( Char ) ( Char[] text )
479     {
480         static assert(
481             is(Unqual!(Char) == char)
482                 || is(Unqual!(Char) == wchar)
483                 || is(Unqual!(Char) == dchar),
484             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
485         );
486 
487         UtfString!(Char) utf_str = { text };
488         foreach ( i, c; utf_str )
489         {
490             if ( this.isUnencodedEntity(text[i..$]) )
491             {
492                 return true;
493             }
494         }
495 
496         return false;
497     }
498 
499 
500     /***************************************************************************
501 
502         Internal method for checking whether the passed string contains any
503         encoded entities.
504 
505         Params:
506             text = string to check
507 
508         Returns:
509             true if any encoded entities are found
510 
511     ***************************************************************************/
512 
513     protected bool containsEncoded_ ( Char ) ( Char[] text )
514     {
515         static assert(
516             is(Unqual!(Char) == char)
517                 || is(Unqual!(Char) == wchar)
518                 || is(Unqual!(Char) == dchar),
519             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
520         );
521 
522         UtfString!(Char) utf_str = { text };
523         foreach ( i, c; utf_str )
524         {
525             auto entity = this.sliceEncodedEntity(text[i..$]);
526             if ( entity.length )
527             {
528                 return true;
529             }
530         }
531 
532         return false;
533     }
534 
535 
536     /***************************************************************************
537 
538         Appends an encoded entity to a string (in the form "&entity_name;").
539 
540         Params:
541             text = string to append to
542             c = unicode character for entity to append
543 
544         Returns:
545             appended string
546 
547     ***************************************************************************/
548 
549     protected Char[] appendEncodedEntity ( Char ) ( ref Char[] text, dchar c )
550     {
551         static assert(is(Char == char) || is(Char == wchar) || is(Char == dchar),
552                 This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof);
553 
554         auto name = this.entities.getName(c);
555         if ( name.length )
556         {
557             text.append(this.charTo!(Char)(this.entities.getEncodedEntity(c, this.entity_buf)));
558         }
559 
560         return text;
561     }
562 
563 
564     /***************************************************************************
565 
566         Parses content to see if it's an encoded entity string. The criteria
567         are:
568 
569          1. length of "entity" is at least 3
570 
571          2. character 0 is '&'
572 
573          3. a ';' between characters 1 and 16 (note: the upper bound of 16 is not enforced by the implementation)
574 
575          4. no white space character or '&' before the first ';'
576 
577          5. first ';' is after character 2
578 
579         If "entity" complies with all of these, slice from the '&' to the ';' is
580         returned, otherwise null.
581 
582         Params:
583              text = HTML entity string to parse
584 
585         Returns:
586              The entity if parsing was successful or null on failure.
587 
588     ***************************************************************************/
589 
590     protected Char[] sliceEncodedEntity ( Char ) ( Char[] text )
591     {
592         static assert(
593             is(Unqual!(Char) == char)
594                 || is(Unqual!(Char) == wchar)
595                 || is(Unqual!(Char) == dchar),
596             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
597         );
598 
599         if ( text.length <= 2 )                             // a) criterion
600         {
601             return "";
602         }
603 
604         Char[] entity;
605         UtfString!(Char, true) utf_str = { text };
606         foreach ( i, c; utf_str )
607         {
608             if ( i == 0 )
609             {
610                 if ( c != '&' )                                // b) criterion
611                 {
612                     break;
613                 }
614             }
615             else
616             {
617                 if ( c == '&' || this.isSpace(c) )            // d) criterion
618                 {
619                     break;
620                 }
621 
622                 if ( c == ';' )
623                 {
624                     if ( i < 2 )                            // e) criterion
625                     {
626                         break;
627                     }
628 
629                     entity = text[0 .. i + 1];
630                     break;
631                 }
632             }
633         }
634 
635         return entity;
636     }
637 
638 
639     /***************************************************************************
640 
641         Checks whether the given character is a space.
642 
643         Params:
644             c = character to check
645 
646         Returns:
647             true if the character is a space
648 
649     ***************************************************************************/
650 
651     protected bool isSpace ( Char ) ( Char c )
652     {
653         static assert(
654             is(Unqual!(Char) == char)
655                 || is(Unqual!(Char) == wchar)
656                 || is(Unqual!(Char) == dchar),
657             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
658         );
659 
660         dchar unicode = c;
661         StringSearch!(true) str_search;
662         return !!str_search.isSpace(unicode);
663     }
664 
665 
666     /***************************************************************************
667 
668         Converts an encoded entity to a unicode character.
669 
670         Params:
671             entity = entity content to convert; including leading '&' and
672                 terminating ';'
673 
674         Returns:
675             the unicode character or InvalidUnicode on failure
676 
677     ***************************************************************************/
678 
679     protected dchar decodeCharacterEntity ( Char ) ( Char[] entity )
680     in
681     {
682         assert(entity.length >= 2, "character entity too short");
683         assert(entity[0] == '&' && entity[$ - 1] == ';', "invalid character entity");
684     }
685     do
686     {
687         static assert(
688             is(Unqual!(Char) == char)
689                 || is(Unqual!(Char) == wchar)
690                 || is(Unqual!(Char) == dchar),
691             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
692         );
693 
694         return this.entities.getUnicode(entity[1 .. $ - 1]);
695     }
696 
697 
698     /***************************************************************************
699 
700         Converts an encoded numeric character reference entity to a unicode
701         character. Numeric character references are either:
702 
703              &#<decimal Unicode>;
704         or
705              &#x<hexadecimal Unicode>;
706 
707         (case insensitive)
708 
709         Examples:
710 
711              Entity      Character       Unicode hex (dec)
712              "&#65;"     'A'             0x41 (65)
713              "&#xE1;"    'á'             0xE1 (225)
714              "&#Xf1;"    'ñ'             0xF1 (241)
715 
716         Params:
717             entity = entity content to convert; including leading "&#" and
718                 terminating ';'
719 
720         Returns:
721             the unicode character or InvalidUnicode on failure
722 
723     ***************************************************************************/
724 
725     protected dchar decodeNumericCharacterRef ( Char ) ( Char[] entity )
726     in
727     {
728         assert(entity.length >= 2, "character entity too short");
729         assert(entity[0] == '&' && entity[$ - 1] == ';', "invalid character entity");
730     }
731     do
732     {
733         static assert(
734             is(Unqual!(Char) == char)
735                 || is(Unqual!(Char) == wchar)
736                 || is(Unqual!(Char) == dchar),
737             This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
738         );
739 
740         dchar unicode = InvalidUnicode;
741 
742         try
743         {
744             // Get the first character after the '&'
745             auto c = entity[2];
746 
747             // hexadecimal
748             if ( c == 'x' || c == 'X' )
749             {
750                 unicode = cast(dchar) Integer.toInt(entity[3 .. $ - 1], 16);
751             }
752             // decimal
753             else
754             {
755                 unicode = cast(dchar) Integer.toInt(entity[2 .. $ - 1], 10);
756             }
757         }
758         catch {}
759 
760         return unicode;
761     }
762 
763 
764     /***************************************************************************
765 
766         Converts from a unicode dchar to an array of the specified character
767         type, doing utf8 encoding if applicable.
768 
769         Params:
770             unicode = unicode character to convert
771 
772         Returns:
773             converted character string
774 
775     ***************************************************************************/
776 
777     private Char[] dcharTo ( Char ) ( dchar unicode )
778     {
779         dchar[1] str;
780         str[0] = unicode;
781         return this.dcharTo!(Char)(str);
782     }
783 
784 
785     /***************************************************************************
786 
787         Converts from a unicode dchar[] to an array of the specified character
788         type, doing utf8 encoding if applicable.
789 
790         Params:
791             unicode = unicode string to convert
792 
793         Returns:
794             converted character string
795 
796     ***************************************************************************/
797 
798     private Char[] dcharTo ( Char ) ( dchar[] unicode )
799     {
800         static if ( is(Char == char) )
801         {
802             return super.dcharTo!(Char)(unicode, this.char_buffer);
803         }
804         else static if ( is(Char == wchar) )
805         {
806             return super.dcharTo!(Char)(unicode, this.wchar_buffer);
807         }
808         else static if ( is(Char == dchar) )
809         {
810             return super.dcharTo!(Char)(unicode, this.dchar_buffer);
811         }
812         else
813         {
814             static assert(false, typeof(this).stringof ~ ".dcharTo - method template can only handle char types");
815         }
816     }
817 
818 
819     /***************************************************************************
820 
821         Converts from a single char to an array of the specified character type.
822 
823         Params:
824             text = character to convert
825 
826         Returns:
827             converted character string
828 
829     ***************************************************************************/
830 
831     private Char[] charTo ( Char ) ( char text )
832     {
833         dchar[1] str;
834         str[0] = text;
835         return this.charTo!(Char)(str);
836     }
837 
838 
839     /***************************************************************************
840 
841         Converts from a utf8 char array to an array of the specified character
842         type.
843 
844         Params:
845             text = string to convert
846 
847         Returns:
848             converted character string
849 
850     ***************************************************************************/
851 
852     private Char[] charTo ( Char ) ( char[] text )
853     {
854         static if ( is(Char == char) )
855         {
856             return super.charTo!(Char)(text, this.char_buffer);
857         }
858         else static if ( is(Char == wchar) )
859         {
860             return super.charTo!(Char)(text, this.wchar_buffer);
861         }
862         else static if ( is(Char == dchar) )
863         {
864             return super.charTo!(Char)(text, this.dchar_buffer);
865         }
866         else
867         {
868             static assert(false, typeof(this).stringof ~ ".charTo - method template can only handle char types");
869         }
870     }
871 }