/*******************************************************************************

    Template class for xml / html / xhtml / etc (markup language) entity
    en/decoders, which share basically the same entity encoding scheme, only
    differing in the exact entities which must be encoded. (The html entities
    are a superset of the xml entities, for example.)

    See_Also:
        http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references

    Example usage:

    ---

        import ocean.text.entities.HtmlEntityCodec;

        scope entity_codec = new HtmlEntityCodec;

        char[] test = "hello & world © ß &#x230;'";

        if ( entity_codec.containsUnencoded(test) )
        {
            char[] encoded;
            entity_codec.encode(test, encoded);
        }

    ---

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

*******************************************************************************/

module ocean.text.entities.model.MarkupEntityCodec;




import ocean.meta.types.Qualifiers;

import ocean.core.Array;

import ocean.text.entities.model.IEntityCodec;
import ocean.text.entities.model.IEntitySet;

import ocean.text.utf.UtfString;

import ocean.text.util.StringSearch;

import Utf = ocean.text.convert.Utf;

import Math = ocean.math.Math: min;

import Integer = ocean.text.convert.Integer_tango: toInt;

import ocean.core.Verify;


/*******************************************************************************

    Class to en/decode xml / html style entities.

    Params:
        E = entity set type, providing the list of entities which this codec
            encodes and decodes

*******************************************************************************/

public class MarkupEntityCodec ( E : IEntitySet ) : IEntityCodec!(E)
{
    /***************************************************************************

        This alias.

    ***************************************************************************/

    public alias typeof(this) This;


    /***************************************************************************

        Buffers for each character type, used as conversion targets by the
        methods charTo() & dcharTo().

    ***************************************************************************/

    private char[] char_buffer;

    private wchar[] wchar_buffer;

    private dchar[] dchar_buffer;


    /***************************************************************************

        Buffer used when formatting an entity.

    ***************************************************************************/

    private char[] entity_buf;


    /***************************************************************************

        Encode any unencoded entities in the input string.

        Params:
            text = string to encode
            encoded = output string (cleared before encoding)

        Returns:
            encoded output string

    ***************************************************************************/

    public override char[] encode ( const(char)[] text, ref char[] encoded )
    {
        return this.encode_(text, encoded);
    }

    public override wchar[] encode ( const(wchar)[] text, ref wchar[] encoded )
    {
        return this.encode_(text, encoded);
    }

    public override dchar[] encode ( const(dchar)[] text, ref dchar[] encoded )
    {
        return this.encode_(text, encoded);
    }


    /***************************************************************************

        Decode any encoded entities in the input string.

        Params:
            text = string to decode
            decoded = output string (cleared before decoding)

        Returns:
            decoded output string

    ***************************************************************************/

    public override mstring decode ( const(char)[] text, ref mstring decoded )
    {
        return this.decode_(text, decoded);
    }

    public override wchar[] decode ( const(wchar)[] text, ref wchar[] decoded )
    {
        return this.decode_(text, decoded);
    }

    public override dchar[] decode ( const(dchar)[] text, ref dchar[] decoded )
    {
        return this.decode_(text, decoded);
    }


    /***************************************************************************

        Checks whether the input string contains any unencoded entities.

        Params:
            text = string to check

        Returns:
            true if one or more unencoded entities are found

    ***************************************************************************/

    public override bool containsUnencoded ( const(char)[] text )
    {
        return this.containsUnencoded_(text);
    }

    public override bool containsUnencoded ( const(wchar)[] text )
    {
        return this.containsUnencoded_(text);
    }

    public override bool containsUnencoded ( const(dchar)[] text )
    {
        return this.containsUnencoded_(text);
    }


    /***************************************************************************

        Checks whether the input string contains any encoded entities.

        Params:
            text = string to check

        Returns:
            true if one or more encoded entities are found

    ***************************************************************************/

    public override bool containsEncoded ( const(char)[] text )
    {
        return this.containsEncoded_(text);
    }

    public override bool containsEncoded ( const(wchar)[] text )
    {
        return this.containsEncoded_(text);
    }

    public override bool containsEncoded ( const(dchar)[] text )
    {
        return this.containsEncoded_(text);
    }


    /***************************************************************************

        Checks whether the input string begins with an unencoded entity.

        Note: a full string has to be passed (not just a single character), as
        '&' is an unencoded entity, but "&amp;" is not - these cases are not
        distinguishable from just the 1st character.

        An '&' which is followed by something that looks like an encoded entity
        but which cannot be decoded (decodeEntity() yields InvalidUnicode) is
        also treated as unencoded.

        Params:
            text = string to check

        Returns:
            true if the first character in the input string is an unencoded
            entity

    ***************************************************************************/

    public bool isUnencodedEntity ( Char ) ( Char[] text )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof);

        auto c = UtfString!(Char, true).extract(text);

        if ( !(c in this.entities) )
        {
            return false;
        }

        if ( c != '&' )
        {
            return true;
        }

        // An ampersand only counts as unencoded if the characters following it
        // do not form a decodable entity.
        auto entity = this.sliceEncodedEntity(text);
        if ( !entity.length )
        {
            return true;
        }

        return this.decodeEntity(entity) == InvalidUnicode;
    }


    /***************************************************************************

        Checks whether the input string begins with an encoded entity.

        Only the form of the entity is checked (see sliceEncodedEntity()), not
        whether it can actually be decoded.

        Params:
            text = string to check
            exact_match = if true, the encoded entity must fill the entire input
                string

        Returns:
            true if the string begins with an encoded entity

    ***************************************************************************/

    public bool isEncodedEntity ( Char ) ( Char[] text, bool exact_match = false )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        auto entity = this.sliceEncodedEntity(text);
        if ( !entity.length )
        {
            return false;
        }

        return !exact_match || entity.length == text.length;
    }


    /***************************************************************************

        Converts an encoded entity to a unicode character. The entity may be
        either:
            - a numeric character reference (eg "&#xE1;" for 'á'), or
            - a named ISO8859-1/15 (Latin 1/9) entity (eg "&szlig;" for 'ß').

        The passed string is checked with verify() to be exactly one encoded
        entity (isEncodedEntity() with exact_match = true).

        Params:
            entity = entity content to convert; leading '&' and terminating ';'
                are expected

        Returns:
            the unicode character or InvalidUnicode on failure

    ***************************************************************************/

    public dchar decodeEntity ( Char ) ( Char[] entity )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        verify(this.isEncodedEntity(entity, true),
            This.stringof ~ ".decodeEntity - invalid character entity");

        dchar unicode = InvalidUnicode;

        if ( entity.length )
        {
            // The character directly after the '&' decides between a numeric
            // reference ("&#...;") and a named entity ("&name;").
            UtfString!(Char, true) utf_str = { entity };
            auto c = utf_str[1];
            if (c == '#')
            {
                unicode = this.decodeNumericCharacterRef(entity);
            }
            else
            {
                unicode = this.decodeCharacterEntity(entity);
            }
        }

        return unicode;
    }


    /***************************************************************************

        Internal method for encoding any unencoded entities in a string.

        Params:
            text = string to encode
            encoded = encoded output string (cleared before encoding)

        Returns:
            encoded output string

    ***************************************************************************/

    protected MutChar[] encode_ ( ConstChar, MutChar ) ( ConstChar[] text,
        ref MutChar[] encoded )
    {
        static assert (is(Unqual!(ConstChar) == Unqual!(MutChar)));

        static assert(
            is(MutChar == char)
            || is(MutChar == wchar)
            || is(MutChar == dchar),
            This.stringof ~ " template parameter MutChar must be one of {char, wchar, dchar}, not " ~ MutChar.stringof
        );

        encoded.length = 0;

        // Index just past the last character which was replaced by an entity;
        // everything from here up to the next special character is copied
        // verbatim.
        size_t last_special_char;
        size_t i;
        while ( i < text.length )
        {
            ConstChar[] process = text[i..$];

            size_t width;
            auto c = UtfString!(ConstChar, true).extract(process, width);

            if ( this.isUnencodedEntity(process) )
            {
                encoded.append(text[last_special_char..i]);

                this.appendEncodedEntity(encoded, c);

                last_special_char = i + width;
            }

            i += width;
        }

        encoded.append(text[last_special_char..$]);
        return encoded;
    }


    /***************************************************************************

        Internal method for decoding any encoded entities in a string.

        Note that a sequence which has the form of an encoded entity but which
        decodes to InvalidUnicode is removed from the output (nothing is
        appended in its place).

        Params:
            text = string to decode
            decoded = decoded output string (cleared before decoding)

        Returns:
            decoded output string

    ***************************************************************************/

    protected MutChar[] decode_ ( ConstChar, MutChar ) ( ConstChar[] text,
        ref MutChar[] decoded )
    {
        static assert (is(Unqual!(ConstChar) == Unqual!(MutChar)));

        static assert(
            is(MutChar == char)
            || is(MutChar == wchar)
            || is(MutChar == dchar),
            This.stringof ~ " template parameter MutChar must be one of {char, wchar, dchar}, not " ~ MutChar.stringof
        );

        decoded.length = 0;

        // Index just past the last entity which was consumed; everything from
        // here up to the next entity is copied verbatim.
        size_t last_special_char = 0;
        size_t i = 0;
        while ( i < text.length )
        {
            if ( text[i] == '&')
            {
                auto entity = this.sliceEncodedEntity(text[i..$]);
                if ( entity.length )
                {
                    decoded.append(text[last_special_char..i]);

                    dchar unicode = this.decodeEntity(entity);
                    if ( unicode != InvalidUnicode )
                    {
                        decoded.append(this.dcharTo!(MutChar)(unicode));
                    }

                    // Skip over the whole entity, whether decodable or not.
                    i += entity.length;
                    last_special_char = i;
                    continue;
                }
            }
            ++i;
        }

        decoded.append(text[last_special_char..$]);
        return decoded;
    }


    /***************************************************************************

        Internal method for checking whether the passed string contains any
        unencoded entities.

        Params:
            text = string to check

        Returns:
            true if any unencoded entities are found

    ***************************************************************************/

    protected bool containsUnencoded_ ( Char ) ( Char[] text )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        UtfString!(Char) utf_str = { text };
        foreach ( i, c; utf_str )
        {
            if ( this.isUnencodedEntity(text[i..$]) )
            {
                return true;
            }
        }

        return false;
    }


    /***************************************************************************

        Internal method for checking whether the passed string contains any
        encoded entities.

        Params:
            text = string to check

        Returns:
            true if any encoded entities are found

    ***************************************************************************/

    protected bool containsEncoded_ ( Char ) ( Char[] text )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        UtfString!(Char) utf_str = { text };
        foreach ( i, c; utf_str )
        {
            auto entity = this.sliceEncodedEntity(text[i..$]);
            if ( entity.length )
            {
                return true;
            }
        }

        return false;
    }


    /***************************************************************************

        Appends an encoded entity to a string (in the form "&entity_name;").

        Nothing is appended if the entity set has no name for the character.

        Params:
            text = string to append to
            c = unicode character for entity to append

        Returns:
            appended string

    ***************************************************************************/

    protected Char[] appendEncodedEntity ( Char ) ( ref Char[] text, dchar c )
    {
        static assert(is(Char == char) || is(Char == wchar) || is(Char == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof);

        auto name = this.entities.getName(c);
        if ( name.length )
        {
            text.append(this.charTo!(Char)(this.entities.getEncodedEntity(c, this.entity_buf)));
        }

        return text;
    }


    /***************************************************************************

        Parses content to see if it starts with an encoded entity string. The
        criteria are:

            a) length of the text is at least 3

            b) character 0 is '&'

            c) a ';' follows somewhere after character 0 (no upper bound on its
               position is enforced)

            d) no white space character or '&' before the first ';'

            e) first ';' is at index 2 or later (i.e. at least one character
               between '&' and ';')

        If the text complies with all of these, the slice from the '&' to the
        ';' (inclusive) is returned, otherwise null.

        Params:
            text = HTML entity string to parse

        Returns:
            The entity if parsing was successful or null on failure.

    ***************************************************************************/

    protected Char[] sliceEncodedEntity ( Char ) ( Char[] text )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        if ( text.length <= 2 )     // a) criterion
        {
            // null (rather than a string literal) is valid for every Char
            // qualifier and matches the documented failure value.
            return null;
        }

        Char[] entity;
        UtfString!(Char, true) utf_str = { text };
        foreach ( i, c; utf_str )
        {
            if ( i == 0 )
            {
                if ( c != '&' )     // b) criterion
                {
                    break;
                }
            }
            else
            {
                if ( c == '&' || this.isSpace(c) ) // d) criterion
                {
                    break;
                }

                if ( c == ';' )     // c) criterion
                {
                    if ( i < 2 )    // e) criterion
                    {
                        break;
                    }

                    entity = text[0 .. i + 1];
                    break;
                }
            }
        }

        return entity;
    }


    /***************************************************************************

        Checks whether the given character is a space.

        Params:
            c = character to check

        Returns:
            true if the character is a space

    ***************************************************************************/

    protected bool isSpace ( Char ) ( Char c )
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        dchar unicode = c;
        StringSearch!(true) str_search;
        return !!str_search.isSpace(unicode);
    }


    /***************************************************************************

        Converts an encoded (named) entity to a unicode character, by looking
        up the name between the '&' and the ';' in the entity set.

        Params:
            entity = entity content to convert; including leading '&' and
                terminating ';'

        Returns:
            the unicode character or InvalidUnicode on failure

    ***************************************************************************/

    protected dchar decodeCharacterEntity ( Char ) ( Char[] entity )
    in
    {
        assert(entity.length >= 2, "character entity too short");
        assert(entity[0] == '&' && entity[$ - 1] == ';', "invalid character entity");
    }
    do
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        return this.entities.getUnicode(entity[1 .. $ - 1]);
    }


    /***************************************************************************

        Converts an encoded numeric character reference entity to a unicode
        character. Numeric character references are either:

            &#<decimal Unicode>;
        or
            &#x<hexadecimal Unicode>;

        (case insensitive)

        Examples:

            Entity      Character       Unicode hex (dec)
            "&#65;"     'A'             0x41 (65)
            "&#xE1;"    'á'             0xE1 (225)
            "&#Xf1;"    'ñ'             0xF1 (241)

        References without any digits, references whose digits cannot be
        parsed and references whose value is not a Unicode scalar value
        (negative, in the surrogate range 0xD800 - 0xDFFF or above 0x10FFFF)
        all yield InvalidUnicode.

        Params:
            entity = entity content to convert; including leading "&#" and
                terminating ';'

        Returns:
            the unicode character or InvalidUnicode on failure

    ***************************************************************************/

    protected dchar decodeNumericCharacterRef ( Char ) ( Char[] entity )
    in
    {
        assert(entity.length >= 2, "character entity too short");
        assert(entity[0] == '&' && entity[$ - 1] == ';', "invalid character entity");
    }
    do
    {
        static assert(
            is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
            This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
        );

        dchar unicode = InvalidUnicode;

        // The shortest possible numeric reference is "&#0;" (4 characters);
        // anything shorter cannot be indexed / sliced safely below.
        if ( entity.length < 4 )
        {
            return unicode;
        }

        // Get the first character after the "&#"
        auto c = entity[2];
        bool is_hex = c == 'x' || c == 'X';

        auto digits = is_hex ? entity[3 .. $ - 1] : entity[2 .. $ - 1];
        if ( !digits.length )
        {
            return unicode;
        }

        try
        {
            auto value = is_hex
                ? Integer.toInt(digits, 16)
                : Integer.toInt(digits, 10);

            // Only Unicode scalar values may be turned into a dchar: reject
            // negative numbers, the surrogate range and anything > U+10FFFF.
            if ( value >= 0 && value <= 0x10FFFF
                && !(value >= 0xD800 && value <= 0xDFFF) )
            {
                unicode = cast(dchar) value;
            }
        }
        catch ( Exception )
        {
            // Unparsable digits: deliberately best-effort, the reference is
            // reported as InvalidUnicode.
        }

        return unicode;
    }


    /***************************************************************************

        Converts from a unicode dchar to an array of the specified character
        type, doing utf8 encoding if applicable.

        Params:
            unicode = unicode character to convert

        Returns:
            converted character string

    ***************************************************************************/

    private Char[] dcharTo ( Char ) ( dchar unicode )
    {
        dchar[1] str;
        str[0] = unicode;
        return this.dcharTo!(Char)(str[]);
    }


    /***************************************************************************

        Converts from a unicode dchar[] to an array of the specified character
        type, doing utf8 encoding if applicable. The conversion is delegated to
        the base class, using this instance's buffer of the matching character
        type as the destination.

        Params:
            unicode = unicode string to convert

        Returns:
            converted character string

    ***************************************************************************/

    private Char[] dcharTo ( Char ) ( dchar[] unicode )
    {
        static if ( is(Char == char) )
        {
            return super.dcharTo!(Char)(unicode, this.char_buffer);
        }
        else static if ( is(Char == wchar) )
        {
            return super.dcharTo!(Char)(unicode, this.wchar_buffer);
        }
        else static if ( is(Char == dchar) )
        {
            return super.dcharTo!(Char)(unicode, this.dchar_buffer);
        }
        else
        {
            static assert(false, typeof(this).stringof ~ ".dcharTo - method template can only handle char types");
        }
    }


    /***************************************************************************

        Converts from a single char to an array of the specified character type.

        NOTE(review): if the base-class conversion returns its input unchanged
        for Char == char, the result would alias the local one-element buffer -
        confirm against IEntityCodec.charTo before relying on the result beyond
        the call.

        Params:
            text = character to convert

        Returns:
            converted character string

    ***************************************************************************/

    private Char[] charTo ( Char ) ( char text )
    {
        // Must be a char buffer: the array overload below accepts char[] only.
        char[1] str;
        str[0] = text;
        return this.charTo!(Char)(str[]);
    }


    /***************************************************************************

        Converts from a utf8 char array to an array of the specified character
        type. The conversion is delegated to the base class, using this
        instance's buffer of the matching character type as the destination.

        Params:
            text = string to convert

        Returns:
            converted character string

    ***************************************************************************/

    private Char[] charTo ( Char ) ( char[] text )
    {
        static if ( is(Char == char) )
        {
            return super.charTo!(Char)(text, this.char_buffer);
        }
        else static if ( is(Char == wchar) )
        {
            return super.charTo!(Char)(text, this.wchar_buffer);
        }
        else static if ( is(Char == dchar) )
        {
            return super.charTo!(Char)(text, this.dchar_buffer);
        }
        else
        {
            static assert(false, typeof(this).stringof ~ ".charTo - method template can only handle char types");
        }
    }
}