1 /*******************************************************************************
2
3 Copyright:
4 Copyright (C) 2007 Aaron Craelius and Kris Bell
5 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6 All rights reserved.
7
8 License:
9 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10 See LICENSE_TANGO.txt for details.
11
12 Authors: Aaron, Kris
13
14 *******************************************************************************/
15
16 module ocean.text.xml.PullParser;
17
18 import ocean.transition;
19
20 import ocean.text.Util : indexOf;
21
22 import ocean.core.ExceptionDefinitions : XmlException;
23
24 import Integer = ocean.text.convert.Integer_tango;
25
26 import Utf = ocean.text.convert.Utf : toString;
27
28 version (UnitTest)
29 {
30 import ocean.core.Test;
31 }
32
33 /*******************************************************************************
34
    Use -version=whitespace to retain whitespace as data nodes. We
    see a 25% increase in token count and a 10% throughput drop when
    parsing "hamlet.xml" with this option enabled (pullparser alone).
38
39 *******************************************************************************/
40
version (whitespace)
{
    // whitespace is only skipped while inside a start tag
    version = retainwhite;
}
else
{
    // leading whitespace is skipped before each token is examined ...
    version = stripwhite;

    // ... and a Data token takes back the whitespace directly preceding it
    version = partialwhite;
}
48
/*******************************************************************************

    The XML node types

    The numeric values are spelled out; they are the ones the compiler
    assigns to an unnumbered list in this order.

*******************************************************************************/

public enum XmlNodeType
{
    Element   = 0,
    Data      = 1,
    Attribute = 2,
    CData     = 3,
    Comment   = 4,
    PI        = 5,
    Doctype   = 6,
    Document  = 7
}
66
/*******************************************************************************

    Values returned by the pull-parser

    The order is significant: PullParser.next() tests
    `type < XmlTokenType.EndElement` to find out whether it is still
    inside a start tag, so StartElement and Attribute must sort below
    EndElement. Done is zero, so a token type can be tested for
    "something was parsed" directly.

*******************************************************************************/

public enum XmlTokenType
{
    Done            = 0,    // input exhausted
    StartElement    = 1,
    Attribute       = 2,
    EndElement      = 3,
    EndEmptyElement = 4,
    Data            = 5,
    Comment         = 6,
    CData           = 7,
    Doctype         = 8,
    PI              = 9,
    None            = 10    // state of a parser before the first token
}
87
88 /*******************************************************************************
89
90 Token based xml Parser. Templated to operate with char[], wchar[],
91 and dchar[] content.
92
93 The parser is constructed with some tradeoffs relating to document
94 integrity. It is generally optimized for well-formed documents, and
95 currently may read past a document-end for those that are not well
96 formed. There are various compilation options to enable checks and
97 balances, depending on how things should be handled. We'll settle
98 on a common configuration over the next few weeks, but for now all
99 settings are somewhat experimental. Partly because making some tiny
100 unrelated change to the code can cause notable throughput changes,
101 and we need to track that down.
102
103 We're not yet clear why these swings are so pronounced (for changes
104 outside the code path) but they seem to be related to the alignment
105 of codegen. It could be a cache-line issue, or something else. We'll
106 figure it out, yet it's interesting that some hardware buttons are
107 clearly being pushed
108
109 *******************************************************************************/
110
111 class PullParser(ChMut = char)
112 {
113 alias Const!(ChMut) Ch;
114
115 public int depth;
116 public Ch[] prefix;
117 public Ch[] rawValue;
118 public Ch[] localName;
119 public XmlTokenType type = XmlTokenType.None;
120
121 package XmlText!(Ch) text;
122 private bool stream;
123 private istring errMsg;
124
/***********************************************************************

    Construct a parser on the given content (may be null)

    Params:
        content = text to parse; handed to reset(Ch[]) as is

***********************************************************************/

this(Ch[] content = null)
{
    this.reset(content);
}
135
/***********************************************************************

    Consume the next token and return its type

    On success the public fields describe the token: `type` holds the
    returned value, `prefix` and `localName` the name of an element,
    and `rawValue` the content of a data token. Attributes, empty
    element closures, comments, CDATA sections, doctypes and
    processing instructions are delegated to the do*() helpers. All
    fields are slices of the parsed text.

    Every read in this method is checked against the end of the text.
    Input that stops in the middle of a token yields the result of
    endOfInput().

    Returns:
        the type of the consumed token, or XmlTokenType.Done when the
        input is exhausted

    Throws:
        XmlException on malformed markup, or (via endOfInput) when the
        input ends while depth is non-zero and incremental mode is off

***********************************************************************/

final XmlTokenType next()
{
    auto e = text.end;
    auto p = text.point;

    // at end of document?
    if (p >= e)
        return endOfInput;

    version (stripwhite)
    {
        // strip leading whitespace
        while (*p <= 32)
            if (++p >= e)
                return endOfInput;
    }

    // StartElement or Attribute?
    if (type < XmlTokenType.EndElement)
    {
        version (retainwhite)
        {
            // strip leading whitespace (thanks to DRK)
            while (*p <= 32)
                if (++p >= e)
                    return endOfInput;
        }

        switch (*p)
        {
            case '>':
                // termination of StartElement
                ++depth;
                ++p;
                break;

            case '/':
                // empty element closure
                text.point = p;
                return doEndEmptyElement;

            default:
                // must be attribute instead
                text.point = p;
                return doAttributeName;
        }

        // the '>' may have been the very last character of the text
        if (p >= e)
            return endOfInput;
    }

    // consume data between elements?
    if (*p != '<')
    {
        auto q = p;
        while (++p < e && *p != '<') {}

        if (p < e)
        {
            version (partialwhite)
            {
                // include leading whitespace, but never step in
                // front of the first character of the text
                auto start = text.text.ptr;
                while (q > start && *(q-1) <= 32)
                    --q;
            }
            text.point = p;
            rawValue = q [0 .. p - q];
            return type = XmlTokenType.Data;
        }
        return endOfInput;
    }

    // must be a '<' character, so peek ahead - provided there is
    // something left to peek at
    auto avail = e - p;
    if (avail < 2)
        return endOfInput;

    switch (p[1])
    {
        case '!':
            // one of the following ...
            if (avail >= 4 && p[2..4] == "--")
            {
                text.point = p + 4;
                return doComment;
            }

            if (avail >= 9)
            {
                if (p[2..9] == "[CDATA[")
                {
                    text.point = p + 9;
                    return doCData;
                }

                if (p[2..9] == "DOCTYPE")
                {
                    text.point = p + 9;
                    return doDoctype;
                }

                return doUnexpected("!", p);
            }

            // too little input left to tell which construct this is
            return endOfInput;

        case '\?':
            // must be PI data
            text.point = p + 2;
            return doPI;

        case '/':
            // should be a closing element name
            p += 2;
            auto q = p;
            while (q < e && (*q > 63 || text.name[*q]))
                ++q;

            if (q < e && *q is ':')
            {
                prefix = p[0 .. q - p];
                p = ++q;
                while (q < e && (*q > 63 || text.attributeName[*q]))
                    ++q;

                localName = p[0 .. q - p];
            }
            else
            {
                prefix = null;
                localName = p[0 .. q - p];
            }

            // name ran up to the end of the text: no '>' to be found
            if (q >= e)
                return endOfInput;

            while (*q <= 32)
                if (++q >= e)
                    return endOfInput;

            if (*q is '>')
            {
                --depth;
                text.point = q + 1;
                return type = XmlTokenType.EndElement;
            }
            return doExpected(">", q);

        default:
            // scan new element name
            auto q = ++p;
            while (q < e && (*q > 63 || text.name[*q]))
                ++q;

            // check if we ran past the end
            if (q >= e)
                return endOfInput;

            if (*q != ':')
            {
                prefix = null;
                localName = p [0 .. q - p];
            }
            else
            {
                prefix = p[0 .. q - p];
                p = ++q;
                while (q < e && (*q > 63 || text.attributeName[*q]))
                    ++q;
                localName = p[0 .. q - p];

                // same truncation rule as for an unqualified name
                if (q >= e)
                    return endOfInput;
            }

            text.point = q;
            return type = XmlTokenType.StartElement;
    }
}
299
/***********************************************************************

    Parse one attribute, `[prefix:]name = "value"` or the same with
    single quotes, starting at text.point.

    On success `prefix`, `localName` and `rawValue` are set (the value
    excludes the quotes), text.point is moved behind the closing quote
    and `type` becomes XmlTokenType.Attribute.

    Every scan stops at the end of the text; a truncated attribute
    yields the result of endOfInput().

    Returns:
        XmlTokenType.Attribute, or the result of endOfInput()

    Throws:
        XmlException if the name is not followed by '=' or the value
        does not start with a quote character

***********************************************************************/

private XmlTokenType doAttributeName()
{
    auto p = text.point;
    auto q = p;
    auto e = text.end;

    while (q < e && (*q > 63 || text.attributeName[*q]))
        ++q;
    if (q >= e)
        return endOfInput;

    if (*q is ':')
    {
        prefix = p[0 .. q - p];
        p = ++q;

        while (q < e && (*q > 63 || text.attributeName[*q]))
            ++q;

        localName = p[0 .. q - p];

        if (q >= e)
            return endOfInput;
    }
    else
    {
        prefix = null;
        localName = p[0 .. q - p];
    }

    // optional whitespace between name and '='
    if (*q <= 32)
    {
        do
        {
            if (++q >= e)
                return endOfInput;
        }
        while (*q <= 32);
    }

    if (*q is '=')
    {
        // optional whitespace between '=' and the opening quote
        do
        {
            if (++q >= e)
                return endOfInput;
        }
        while (*q <= 32);

        auto quote = *q;
        switch (quote)
        {
            case '"':
            case '\'':
                p = q + 1;

                // look for the matching quote character
                do
                {
                    if (++q >= e)
                        return endOfInput;
                }
                while (*q != quote);

                rawValue = p[0 .. q - p];
                text.point = q + 1;   // skip end quote
                return type = XmlTokenType.Attribute;

            default:
                return doExpected("\' or \"", q);
        }
    }

    return doExpected ("=", q);
}
366
/***********************************************************************

    Consume the "/>" that closes an empty element.

    Clears `prefix` and `localName`, moves text.point behind the "/>"
    and sets `type` to XmlTokenType.EndEmptyElement. `depth` is left
    untouched.

    Returns:
        XmlTokenType.EndEmptyElement, or the result of endOfInput()
        when fewer than two characters are left

    Throws:
        XmlException if the next two characters are not "/>"

***********************************************************************/

private XmlTokenType doEndEmptyElement()
{
    auto p = text.point;

    // a lone '/' at the very end of the text is a truncated "/>"
    if (text.end - p < 2)
        return endOfInput;

    if (p[0] is '/' && p[1] is '>')
    {
        localName = prefix = null;
        text.point = p + 2;
        return type = XmlTokenType.EndEmptyElement;
    }
    return doExpected("/>", p);
}
381
/***********************************************************************

    Consume a comment body; text.point is expected to sit right behind
    the opening "<!--".

    On success `rawValue` is the text up to (excluding) the closing
    "-->", text.point is moved behind it and `type` becomes
    XmlTokenType.Comment.

    Returns:
        XmlTokenType.Comment, or the result of endOfInput() when no
        "-->" is found before the end of the text

***********************************************************************/

private XmlTokenType doComment()
{
    auto e = text.end;
    auto p = text.point;
    auto q = p;

    while (p < e)
    {
        while (*p != '-')
            if (++p >= e)
                return endOfInput;

        // fewer than three characters left cannot hold "-->"
        if (e - p < 3)
            return endOfInput;

        if (p[0..3] == "-->")
        {
            text.point = p + 3;
            rawValue = q [0 .. p - q];
            return type = XmlTokenType.Comment;
        }
        ++p;
    }

    return endOfInput;
}
409
/***********************************************************************

    Consume a CDATA section body; text.point is expected to sit right
    behind the opening "<![CDATA[".

    On success `rawValue` is the complete section content up to
    (excluding) the closing "]]>", text.point is moved behind it and
    `type` becomes XmlTokenType.CData.

    Returns:
        XmlTokenType.CData, or the result of endOfInput() when no
        "]]>" is found before the end of the text

***********************************************************************/

private XmlTokenType doCData()
{
    auto e = text.end;
    auto p = text.point;

    // start of the content: fixed once, so that a ']' inside the
    // section which does not begin "]]>" stays part of the value
    auto q = p;

    while (p < e)
    {
        while (*p != ']')
            if (++p >= e)
                return endOfInput;

        // fewer than three characters left cannot hold "]]>"
        if (e - p < 3)
            return endOfInput;

        if (p[0..3] == "]]>")
        {
            text.point = p + 3;
            rawValue = q [0 .. p - q];
            return type = XmlTokenType.CData;
        }
        ++p;
    }

    return endOfInput;
}
437
/***********************************************************************

    Consume a processing instruction body; text.point is expected to
    sit right behind the opening "<?".

    On success `rawValue` is the text up to (excluding) the closing
    "?>", text.point is moved behind it and `type` becomes
    XmlTokenType.PI.

    Returns:
        XmlTokenType.PI, or the result of endOfInput() when no "?>"
        is found before the end of the text

***********************************************************************/

private XmlTokenType doPI()
{
    auto e = text.end;
    auto p = text.point;
    auto q = p;

    while (p < e)
    {
        while (*p != '\?')
            if (++p >= e)
                return endOfInput;

        // a '?' as the last character cannot be followed by '>'
        if (e - p < 2)
            return endOfInput;

        if (p[1] == '>')
        {
            rawValue = q [0 .. p - q];
            text.point = p + 2;
            return type = XmlTokenType.PI;
        }
        ++p;
    }
    return endOfInput;
}
464
/***********************************************************************

    Consume a document type declaration; text.point is expected to sit
    right behind "<!DOCTYPE".

    Leading whitespace is skipped. A '[' makes the scan jump to the
    next ']', so a '>' in between does not end the declaration. On
    success `rawValue` is the text up to (excluding) the closing '>',
    `prefix` is cleared, text.point is moved behind the '>' and `type`
    becomes XmlTokenType.Doctype.

    Returns:
        XmlTokenType.Doctype, or the result of endOfInput() when the
        text ends before the closing '>'

***********************************************************************/

private XmlTokenType doDoctype()
{
    auto e = text.end;
    auto p = text.point;

    // strip leading whitespace; "DOCTYPE" may also be the last thing
    // in the text, in which case there is nothing to look at
    while (p < e && *p <= 32)
        ++p;

    auto q = p;
    while (p < e)
    {
        if (*p is '>')
        {
            rawValue = q [0 .. p - q];
            prefix = null;
            text.point = p + 1;
            return type = XmlTokenType.Doctype;
        }

        // skip a bracketed part in one go
        if (*p == '[')
            do
            {
                if (++p >= e)
                    return endOfInput;
            }
            while (*p != ']');
        ++p;
    }

    // the loop only ends once the text is used up
    return endOfInput;
}
504
/***********************************************************************

    Report that the text is used up.

    Returns:
        XmlTokenType.Done

    Throws:
        XmlException ("Unexpected EOF") when depth is non-zero and
        incremental mode is off

***********************************************************************/

private XmlTokenType endOfInput ()
{
    auto unbalanced = depth != 0;

    if (unbalanced && !stream)
        error ("Unexpected EOF");

    return XmlTokenType.Done;
}
516
/***********************************************************************

    Raise a parse error for something that should not be there.

    Params:
        msg = description of the offending input
        p   = where in the text it was found

    Throws:
        XmlException, always (via position)

***********************************************************************/

private XmlTokenType doUnexpected (istring msg, Ch* p)
{
    istring report = "parse error :: unexpected " ~ msg;
    return position (report, p);
}
525
/***********************************************************************

    Raise a parse error for something that is missing. The message
    quotes the single character found at `p`.

    Params:
        msg = description of what was expected
        p   = where in the text it was expected; must be readable

    Throws:
        XmlException, always (via position)

***********************************************************************/

private XmlTokenType doExpected (istring msg, Ch* p)
{
    // scratch space for the character conversion
    char[6] tmp = void;

    auto found = idup(Utf.toString(p[0..1], tmp));
    istring report = "parse error :: expected " ~ msg ~ " instead of "
                   ~ found;

    return position (report, p);
}
536
/***********************************************************************

    Raise an error whose message ends with the offset of `p` from the
    start of the text.

    Params:
        msg = error description
        p   = location in the text the error refers to

    Throws:
        XmlException, always (via error)

***********************************************************************/

private XmlTokenType position (istring msg, Ch* p)
{
    auto offset = p - text.text.ptr;
    auto where = idup(Integer.toString(offset));

    return error (msg ~ " at position " ~ where);
}
546
/***********************************************************************

    Remember `msg` as the last error and throw it.

    Params:
        msg = error description

    Throws:
        XmlException carrying `msg`, always; the return type only
        exists so that callers can write `return error(...)`

***********************************************************************/

protected final XmlTokenType error (istring msg)
{
    this.errMsg = msg;

    auto failure = new XmlException (msg);
    throw failure;
}
556
/***********************************************************************

    Return the raw value of the current token

    Returns:
        `rawValue`, unchanged

***********************************************************************/

final Ch[] value()
{
    return this.rawValue;
}
567
/***********************************************************************

    Return the name of the current token

    Returns:
        `localName` when there is no prefix, otherwise a newly
        concatenated "prefix:localName"

***********************************************************************/

final Ch[] name()
{
    if (prefix.length == 0)
        return localName;

    return prefix ~ ":" ~ localName;
}
580
/***********************************************************************

    Returns the text of the last error

    Returns:
        the message stored by error(istring); null after a reset

***********************************************************************/

final istring error()
{
    return this.errMsg;
}
591
/***********************************************************************

    Reset the parser, so that the current text is parsed again from
    its beginning.

    Returns:
        true, always

***********************************************************************/

final bool reset()
{
    auto current = text.text;

    text.reset (current);
    reset_();
    return true;
}
604
/***********************************************************************

    Reset parser with new content

    Params:
        newText = text to parse from now on

***********************************************************************/

final void reset(Ch[] newText)
{
    this.text.reset (newText);
    this.reset_();
}
616
/***********************************************************************

    experimental: set streaming mode

    While the mode is on, running out of input with elements still
    open is reported as XmlTokenType.Done instead of raising
    "Unexpected EOF".

    Use at your own risk, may be removed.

    Params:
        yes = true to switch the mode on, false to switch it off

***********************************************************************/

final void incremental (bool yes = true)
{
    this.stream = yes;
}
629
/***********************************************************************

    Bring the parser state back to "nothing parsed yet" and position
    text.point behind the document preamble.

    The preamble consists of an optional UTF-8 byte order mark (only
    looked for when Ch is one byte wide), optional whitespace and an
    optional "<?xml ... ?>" declaration. The content of the
    declaration is skipped, not interpreted. Nothing is read beyond
    the end of the text, and text.point never ends up behind it.

***********************************************************************/

private void reset_()
{
    depth = 0;
    errMsg = null;
    type = XmlTokenType.None;

    auto p = text.point;
    if (p)
    {
        auto e = text.end;

        static if (Ch.sizeof == 1)
        {
            // consume UTF8 BOM
            if (e - p >= 3
                && p[0] is 0xef && p[1] is 0xbb && p[2] is 0xbf)
                p += 3;
        }

        //TODO enable optional declaration parsing
        while (p < e && *p <= 32)
            ++p;

        if (e - p >= 5
            && p[0] is '<' && p[1] is '\?' && p[2..5] == "xml")
        {
            p += 5;
            while (p < e && *p != '\?')
                ++p;

            // step over the closing "?>", or stop at the end of the
            // text when the declaration is cut short
            p = (e - p >= 2) ? p + 2 : e;
        }
        text.point = p;
    }
}
666 }
667
668
/*******************************************************************************

    The text being parsed, the read cursor into it, and the lookup
    tables used for scanning names.

    Both tables are indexed with a character code below 64 (callers
    test `> 63` first); a zero entry ends a name scan.

*******************************************************************************/

package struct XmlText(Ch)
{
    package Ch*     end;        // one past the last character of text
    package size_t  len;        // text.length
    package Ch[]    text;       // the complete content
    package Ch*     point;      // current read position

    /***********************************************************************

        Take over new content and put the cursor at its start.

        Params:
            newText = content to scan

    ***********************************************************************/

    final void reset(Ch[] newText)
    {
        auto start = newText.ptr;
        auto count = newText.length;

        (&this).text  = newText;
        (&this).len   = count;
        (&this).point = start;
        (&this).end   = start + count;
    }

    // scan ends at: NUL, TAB, LF, CR, space, '/', ':', '>', '?'
    static Const!(ubyte[64]) name =
    [
     // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0   // 3
    ];

    // as above, and additionally at: '!', '<', '='
    static Const!(ubyte[64]) attributeName =
    [
     // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0   // 3
    ];
}
706
/*******************************************************************************

    Unit test support: a sample document and a routine that walks a
    parser through it token by token.

*******************************************************************************/

version (UnitTest)
{
    /***********************************************************************

        Pull every token of testXML out of `parser` and check it.

        Params:
            parser = parser that has been (re)set to testXML

    ***********************************************************************/

    void testParser(Ch)(PullParser!(Ch) parser)
    {
        // <!DOCTYPE element [ ... ]>
        test(parser.next);
        test(parser.value == "element [ <!ELEMENT element (#PCDATA)>]");
        test(parser.type == XmlTokenType.Doctype);

        // <element - depth only grows once the '>' has been consumed
        test(parser.next);
        test(parser.localName == "element");
        test(parser.type == XmlTokenType.StartElement);
        test(parser.depth == 0);

        // attr="1"
        test(parser.next);
        test(parser.localName == "attr");
        test(parser.value == "1");

        // attr2="two"
        test(parser.next);
        test(parser.type == XmlTokenType.Attribute);
        test(parser.localName == "attr2");
        test(parser.value == "two");

        // <!--comment-->
        test(parser.next);
        test(parser.value == "comment");

        // test&Z
        test(parser.next);
        test(parser.rawValue == "test&Z");

        // <qual:elem />
        test(parser.next);
        test(parser.prefix == "qual");
        test(parser.localName == "elem");
        test(parser.next);
        test(parser.type == XmlTokenType.EndEmptyElement);

        // <el2
        test(parser.next);
        test(parser.localName == "el2");
        test(parser.depth == 1);

        // attr3 = '3three'
        test(parser.next);
        test(parser.localName == "attr3");
        test(parser.value == "3three", parser.value);

        // <![CDATA[sdlgjsh]]>
        test(parser.next);
        test(parser.rawValue == "sdlgjsh");

        // <el3 />
        test(parser.next);
        test(parser.localName == "el3");
        test(parser.depth == 2);
        test(parser.next);
        test(parser.type == XmlTokenType.EndEmptyElement);

        // data
        test(parser.next);
        test(parser.value == "data");

        // <?pi test?>
        test(parser.next);
        test(parser.rawValue == "pi test", parser.rawValue);

        // </el2>
        test(parser.next);
        test(parser.localName == "el2");

        // </element>
        test(parser.next);
        test(parser.localName == "element");

        // nothing left
        test(!parser.next);
    }


    /***********************************************************************

        The document testParser() expects

    ***********************************************************************/

    static immutable istring testXML = "<?xml version=\"1.0\" ?><!DOCTYPE element [ <!ELEMENT element (#PCDATA)>]><element "
        ~ "attr=\"1\" attr2=\"two\"><!--comment-->test&Z<qual:elem /><el2 attr3 = "
        ~ "'3three'><![CDATA[sdlgjsh]]><el3 />data<?pi test?></el2></element>";
}
775
unittest
{
    // first pass over the sample document
    auto parser = new PullParser!(char)(testXML);
    testParser (parser);

    // Handing the parser new text (or, as here, the same text again)
    // and parsing it must not allocate any memory
    testNoAlloc({
        parser.reset(testXML);
        testParser(parser);
    }());
}