/*******************************************************************************

    Copyright:
        Copyright (C) 2007 Aaron Craelius and Kris Bell
        Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
        See LICENSE_TANGO.txt for details.

    Authors: Aaron, Kris

*******************************************************************************/

module ocean.text.xml.PullParser;

import ocean.meta.types.Qualifiers;

import ocean.text.Util : indexOf;

import ocean.core.ExceptionDefinitions : XmlException;

import Integer = ocean.text.convert.Integer_tango;

import Utf = ocean.text.convert.Utf : toString;

version (unittest)
{
    import ocean.core.Test;
}

/*******************************************************************************

    Use -version=whitespace to retain whitespace as data nodes. We
    see a 25% increase in token count and 10% throughput drop when
    parsing "hamlet.xml" with this option enabled (pullparser alone)

*******************************************************************************/

version (whitespace)
    version = retainwhite;
else
{
    version = stripwhite;
    version = partialwhite;
}

/*******************************************************************************

    The XML node types

*******************************************************************************/

public enum XmlNodeType
{
    Element,
    Data,
    Attribute,
    CData,
    Comment,
    PI,
    Doctype,
    Document
}

/*******************************************************************************

    Values returned by the pull-parser

    Note that PullParser.next() relies on the declaration order: every value
    ordered before EndElement is treated as "still inside a start tag".

*******************************************************************************/

public enum XmlTokenType
{
    Done,
    StartElement,
    Attribute,
    EndElement,
    EndEmptyElement,
    Data,
    Comment,
    CData,
    Doctype,
    PI,
    None
}

/*******************************************************************************

    Token based xml Parser. Templated to operate with char[], wchar[],
    and dchar[] content.

    The parser is constructed with some tradeoffs relating to document
    integrity. It is generally optimized for well-formed documents. All
    scanning is bounded by the end of the supplied content: when a token
    is cut short by the end of the content, next() returns
    XmlTokenType.Done, or throws an XmlException ("Unexpected EOF") if
    elements are still open and incremental (streaming) mode is off.

    There are various compilation options to enable checks and
    balances, depending on how things should be handled. All settings
    are somewhat experimental, partly because making some tiny
    unrelated change to the code can cause notable throughput changes.

    It is not yet clear why these swings are so pronounced (for changes
    outside the code path) but they seem to be related to the alignment
    of codegen. It could be a cache-line issue, or something else.

*******************************************************************************/

class PullParser(ChMut = char)
{
    alias const(ChMut) Ch;

    /// Element nesting depth; incremented when the '>' closing a start tag
    /// is consumed and decremented when an end tag is parsed
    public int depth;

    /// Namespace prefix of the current element/attribute name (null if none)
    public Ch[] prefix;

    /// Raw (undecoded) slice of the content for the current token
    public Ch[] rawValue;

    /// Local part of the current element/attribute name
    public Ch[] localName;

    /// Type of the most recently parsed token
    public XmlTokenType type = XmlTokenType.None;

    package XmlText!(Ch) text;
    private bool stream;
    private istring errMsg;

    /***********************************************************************

        Construct a parser on the given content (may be null)

    ***********************************************************************/

    this(Ch[] content = null)
    {
        reset (content);
    }

    /***********************************************************************

        Consume the next token and return its type

        Returns:
            the type of the token consumed, or XmlTokenType.Done when the
            end of the content has been reached

        Throws:
            XmlException on a parse error, or when the content ends while
            elements are still open and streaming mode is off

    ***********************************************************************/

    final XmlTokenType next()
    {
        auto e = text.end;
        auto p = text.point;

        // at end of document?
        if (p >= e)
            return endOfInput;

        version (stripwhite)
        {
            // strip leading whitespace
            while (*p <= 32)
                if (++p >= e)
                    return endOfInput;
        }

        // StartElement or Attribute?
        if (type < XmlTokenType.EndElement)
        {
            version (retainwhite)
            {
                // strip leading whitespace (thanks to DRK)
                while (*p <= 32)
                    if (++p >= e)
                        return endOfInput;
            }

            switch (*p)
            {
                case '>':
                    // termination of StartElement
                    ++depth;
                    ++p;
                    break;

                case '/':
                    // empty element closure
                    text.point = p;
                    return doEndEmptyElement;

                default:
                    // must be attribute instead
                    text.point = p;
                    return doAttributeName;
            }

            // the '>' may have been the very last character
            if (p >= e)
                return endOfInput;
        }

        // consume data between elements?
        if (*p != '<')
        {
            auto q = p;
            while (++p < e && *p != '<') {}

            if (p < e)
            {
                version (partialwhite)
                {
                    // include leading whitespace, without walking back
                    // beyond the start of the content
                    auto start = text.text.ptr;
                    while (q > start && *(q - 1) <= 32)
                        --q;
                }
                text.point = p;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Data;
            }
            return endOfInput;
        }

        // must be a '<' character, so peek ahead (if there is anything left)
        if (e - p < 2)
            return endOfInput;

        switch (p[1])
        {
            case '!':
                // one of the following ...
                if (e - p >= 4 && p[2..4] == "--")
                {
                    text.point = p + 4;
                    return doComment;
                }
                else
                if (e - p >= 9 && p[2..9] == "[CDATA[")
                {
                    text.point = p + 9;
                    return doCData;
                }
                else
                if (e - p >= 9 && p[2..9] == "DOCTYPE")
                {
                    text.point = p + 9;
                    return doDoctype;
                }
                return doUnexpected("!", p);

            case '\?':
                // must be PI data
                text.point = p + 2;
                return doPI;

            case '/':
                // should be a closing element name
                p += 2;
                auto q = p;
                while (q < e && (*q > 63 || text.name[*q]))
                    ++q;

                if (q < e && *q is ':')
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (q < e && (*q > 63 || text.attributeName[*q]))
                        ++q;

                    localName = p[0 .. q - p];
                }
                else
                {
                    prefix = null;
                    localName = p[0 .. q - p];
                }

                // skip whitespace before the closing '>'
                while (q < e && *q <= 32)
                    ++q;
                if (q >= e)
                    return endOfInput;

                if (*q is '>')
                {
                    --depth;
                    text.point = q + 1;
                    return type = XmlTokenType.EndElement;
                }
                return doExpected(">", q);

            default:
                // scan new element name
                auto q = ++p;
                while (q < e && (*q > 63 || text.name[*q]))
                    ++q;

                // check if we ran into the end
                if (q >= e)
                    return endOfInput;

                if (*q != ':')
                {
                    prefix = null;
                    localName = p [0 .. q - p];
                }
                else
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (q < e && (*q > 63 || text.attributeName[*q]))
                        ++q;
                    localName = p[0 .. q - p];
                }

                text.point = q;
                return type = XmlTokenType.StartElement;
        }
    }

    /***********************************************************************

        Parse an attribute (name, '=', quoted value) starting at the
        current position. Sets prefix, localName and rawValue (the text
        between the quotes, quotes excluded).

        Returns:
            XmlTokenType.Attribute on success, otherwise the result of
            endOfInput() if the content ends inside the attribute

        Throws:
            XmlException if '=' or an opening quote is missing

    ***********************************************************************/

    private XmlTokenType doAttributeName()
    {
        auto p = text.point;
        auto q = p;
        auto e = text.end;

        while (q < e && (*q > 63 || text.attributeName[*q]))
            ++q;
        if (q >= e)
            return endOfInput;

        if (*q is ':')
        {
            prefix = p[0 .. q - p];
            p = ++q;

            while (q < e && (*q > 63 || text.attributeName[*q]))
                ++q;

            localName = p[0 .. q - p];
        }
        else
        {
            prefix = null;
            localName = p[0 .. q - p];
        }

        // skip whitespace between the name and '='
        while (q < e && *q <= 32)
            ++q;
        if (q >= e)
            return endOfInput;

        if (*q is '=')
        {
            // skip whitespace between '=' and the opening quote
            while (++q < e && *q <= 32) {}
            if (q >= e)
                return endOfInput;

            auto quote = *q;
            switch (quote)
            {
                case '"':
                case '\'':
                    p = q + 1;
                    // find the matching end quote, staying inside the content
                    while (++q < e && *q != quote) {}
                    if (q < e)
                    {
                        rawValue = p[0 .. q - p];
                        text.point = q + 1; // skip end quote
                        return type = XmlTokenType.Attribute;
                    }
                    return endOfInput;

                default:
                    return doExpected("\' or \"", q);
            }
        }

        return doExpected ("=", q);
    }

    /***********************************************************************

        Consume the "/>" that terminates an empty element. Clears prefix
        and localName.

        Throws:
            XmlException if the content at the current position is not "/>"

    ***********************************************************************/

    private XmlTokenType doEndEmptyElement()
    {
        if (text.end - text.point >= 2
            && text.point[0] is '/' && text.point[1] is '>')
        {
            localName = prefix = null;
            text.point += 2;
            return type = XmlTokenType.EndEmptyElement;
        }
        return doExpected("/>", text.point);
    }

    /***********************************************************************

        Parse a comment body; the current position is just after "<!--".
        rawValue is set to the text preceding the terminating "-->".

    ***********************************************************************/

    private XmlTokenType doComment()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '-')
                if (++p >= e)
                    return endOfInput;

            if (e - p >= 3 && p[0..3] == "-->")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Comment;
            }
            ++p;
        }

        return endOfInput;
    }

    /***********************************************************************

        Parse a CDATA section body; the current position is just after
        "<![CDATA[". rawValue is set to the complete text preceding the
        terminating "]]>", including any ']' characters that are not part
        of the terminator.

    ***********************************************************************/

    private XmlTokenType doCData()
    {
        auto e = text.end;
        auto p = text.point;

        // start of the section content: must stay fixed while scanning,
        // otherwise a lone ']' would truncate the reported value
        auto q = p;

        while (p < e)
        {
            while (*p != ']')
                if (++p >= e)
                    return endOfInput;

            if (e - p >= 3 && p[0..3] == "]]>")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.CData;
            }
            ++p;
        }

        return endOfInput;
    }

    /***********************************************************************

        Parse a processing instruction; the current position is just after
        "<?". rawValue is set to the text preceding the terminating "?>".

    ***********************************************************************/

    private XmlTokenType doPI()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '\?')
                if (++p >= e)
                    return endOfInput;

            if (e - p >= 2 && p[1] == '>')
            {
                rawValue = q [0 .. p - q];
                text.point = p + 2;
                return type = XmlTokenType.PI;
            }
            ++p;
        }
        return endOfInput;
    }

    /***********************************************************************

        Parse a doctype declaration; the current position is just after
        "<!DOCTYPE". Leading whitespace is skipped and rawValue is set to
        the text up to (excluding) the closing '>'. A '>' located between
        '[' and the next ']' does not terminate the declaration.

    ***********************************************************************/

    private XmlTokenType doDoctype()
    {
        auto e = text.end;
        auto p = text.point;

        // strip leading whitespace
        while (p < e && *p <= 32)
            ++p;

        auto q = p;
        while (p < e)
        {
            if (*p is '>')
            {
                rawValue = q [0 .. p - q];
                prefix = null;
                text.point = p + 1;
                return type = XmlTokenType.Doctype;
            }
            else
            {
                // skip over a bracketed section in one go
                if (*p == '[')
                    do {
                        if (++p >= e)
                            return endOfInput;
                    } while (*p != ']');
                ++p;
            }
        }

        // ran out of content before finding the closing '>'
        return endOfInput;
    }

    /***********************************************************************

        Called whenever the end of the content is reached.

        Returns:
            XmlTokenType.Done

        Throws:
            XmlException ("Unexpected EOF") if elements are still open
            (depth is non-zero) and streaming mode is off

    ***********************************************************************/

    private XmlTokenType endOfInput ()
    {
        if (depth && (stream is false))
            error ("Unexpected EOF");

        return XmlTokenType.Done;
    }

    /***********************************************************************

        Raise a parse error reporting unexpected content

        Params:
            msg = description of what was unexpected
            p = position in the content where the problem was found

    ***********************************************************************/

    private XmlTokenType doUnexpected (istring msg, Ch* p)
    {
        return position ("parse error :: unexpected " ~ msg, p);
    }

    /***********************************************************************

        Raise a parse error reporting what was expected and which
        character was found instead

        Params:
            msg = description of what was expected
            p = position of the offending character; must point inside
                the content, as one element is read from it

    ***********************************************************************/

    private XmlTokenType doExpected (istring msg, Ch* p)
    {
        char[6] tmp = void;
        return position ("parse error :: expected " ~ msg ~ " instead of "
            ~ idup(Utf.toString(p[0..1], tmp)), p);
    }

    /***********************************************************************

        Raise an error with the offset of p (relative to the start of the
        content) appended to the message

    ***********************************************************************/

    private XmlTokenType position (istring msg, Ch* p)
    {
        return error (msg ~ " at position "
            ~ idup(Integer.toString(p-text.text.ptr)));
    }

    /***********************************************************************

        Record the error message (retrievable via error()) and throw

        Throws:
            XmlException carrying msg; this method never returns normally

    ***********************************************************************/

    protected final XmlTokenType error (istring msg)
    {
        errMsg = msg;
        throw new XmlException (msg);
    }

    /***********************************************************************

        Return the raw value of the current token

    ***********************************************************************/

    final Ch[] value()
    {
        return rawValue;
    }

    /***********************************************************************

        Return the name of the current token

        When a prefix is present the result is "prefix:localName", which
        is built by concatenation (and therefore allocates); otherwise
        localName is returned directly.

    ***********************************************************************/

    final Ch[] name()
    {
        if (prefix.length)
            return prefix ~ ":" ~ localName;
        return localName;
    }

    /***********************************************************************

        Returns the text of the last error

    ***********************************************************************/

    final istring error()
    {
        return errMsg;
    }

    /***********************************************************************

        Reset the parser

        Restarts parsing of the current content from its beginning.

        Returns:
            always true

    ***********************************************************************/

    final bool reset()
    {
        text.reset (text.text);
        reset_;
        return true;
    }

    /***********************************************************************

        Reset parser with new content

    ***********************************************************************/

    final void reset(Ch[] newText)
    {
        text.reset (newText);
        reset_;
    }

    /***********************************************************************

        experimental: set streaming mode

        In streaming mode reaching the end of the content with open
        elements does not raise "Unexpected EOF".

        Use at your own risk, may be removed.

    ***********************************************************************/

    final void incremental (bool yes = true)
    {
        stream = yes;
    }

    /***********************************************************************

        Reset the parsing state and position the parser after an optional
        UTF-8 BOM (single-byte character types only), leading whitespace
        and an optional "<?xml ... ?>" declaration.

    ***********************************************************************/

    private void reset_()
    {
        depth = 0;
        errMsg = null;
        type = XmlTokenType.None;

        auto p = text.point;
        if (p)
        {
            auto e = text.end;

            static if (Ch.sizeof == 1)
            {
                // consume UTF8 BOM
                if (e - p >= 3
                    && p[0] is 0xef && p[1] is 0xbb && p[2] is 0xbf)
                    p += 3;
            }

            //TODO enable optional declaration parsing
            while (p < e && *p <= 32)
                ++p;

            if (e - p >= 5
                && p[0] is '<' && p[1] is '\?' && p[2..5] == "xml")
            {
                p += 5;
                while (p < e && *p != '\?')
                    ++p;

                // step over "?>" without moving beyond the end of content
                p = (e - p >= 2) ? p + 2 : e;
            }
            text.point = p;
        }
    }
}


/*******************************************************************************

    Holds the content being parsed together with the current parse
    position, plus the lookup tables used to scan names.

*******************************************************************************/

package struct XmlText(Ch)
{
    package Ch* end;        /// one past the last element of text
    package size_t len;     /// length of text
    package Ch[] text;      /// the content being parsed
    package Ch* point;      /// current parse position

    /// Point this instance at newText and rewind to its beginning
    final void reset(Ch[] newText)
    {
        this.text = newText;
        this.len = newText.length;
        this.point = text.ptr;
        this.end = point + len;
    }

    /// Indexed by character code (0 .. 63): non-zero if the character may
    /// be part of an element name. Codes above 63 are accepted by the
    /// scanning loops without consulting the table.
    static const(ubyte[64]) name =
    [
     // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0   // 3
    ];

    /// Same as name, but for attribute names and local names
    static const(ubyte[64]) attributeName =
    [
     // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0   // 3
    ];
}

/*******************************************************************************

    Unit test helpers

*******************************************************************************/

version (unittest)
{
    /***********************************************************************

        Walks the parser over testXML and checks every token

    ***********************************************************************/

    void testParser(Ch)(PullParser!(Ch) itr)
    {
        test(itr.next);
        test(itr.value == "element [ <!ELEMENT element (#PCDATA)>]");
        test(itr.type == XmlTokenType.Doctype);
        test(itr.next);
        test(itr.localName == "element");
        test(itr.type == XmlTokenType.StartElement);
        test(itr.depth == 0);
        test(itr.next);
        test(itr.localName == "attr");
        test(itr.value == "1");
        test(itr.next);
        test(itr.type == XmlTokenType.Attribute);
        test(itr.localName == "attr2");
        test(itr.value == "two");
        test(itr.next);
        test(itr.value == "comment");
        test(itr.next);
        test(itr.rawValue == "test&Z");
        test(itr.next);
        test(itr.prefix == "qual");
        test(itr.localName == "elem");
        test(itr.next);
        test(itr.type == XmlTokenType.EndEmptyElement);
        test(itr.next);
        test(itr.localName == "el2");
        test(itr.depth == 1);
        test(itr.next);
        test(itr.localName == "attr3");
        test(itr.value == "3three", itr.value);
        test(itr.next);
        test(itr.rawValue == "sdlgjsh");
        test(itr.next);
        test(itr.localName == "el3");
        test(itr.depth == 2);
        test(itr.next);
        test(itr.type == XmlTokenType.EndEmptyElement);
        test(itr.next);
        test(itr.value == "data");
        test(itr.next);
        test(itr.rawValue == "pi test", itr.rawValue);
        test(itr.next);
        test(itr.localName == "el2");
        test(itr.next);
        test(itr.localName == "element");
        test(!itr.next);
    }


    /***********************************************************************

        Document used by testParser

    ***********************************************************************/

    static immutable istring testXML = "<?xml version=\"1.0\" ?><!DOCTYPE element [ <!ELEMENT element (#PCDATA)>]><element "
        ~ "attr=\"1\" attr2=\"two\"><!--comment-->test&Z<qual:elem /><el2 attr3 = "
        ~ "'3three'><![CDATA[sdlgjsh]]><el3 />data<?pi test?></el2></element>";
}

unittest
{
    auto itr = new PullParser!(char)(testXML);
    testParser (itr);

    // Parsing new text (or even the same one) should not involve any further
    // memory allocation
    testNoAlloc({
        itr.reset(testXML);
        testParser(itr);
    }());
}

unittest
{
    // A lone ']' inside a CDATA section is content, not a terminator, and
    // must not truncate the reported value
    auto itr = new PullParser!(char)("<a><![CDATA[x]y]]></a>");
    test(itr.next == XmlTokenType.StartElement);
    test(itr.next == XmlTokenType.CData);
    test(itr.value == "x]y");
    test(itr.next == XmlTokenType.EndElement);
    test(itr.depth == 0);
}