1 /*******************************************************************************
2 
3     Copyright:
4         Copyright (C) 2007 Aaron Craelius and Kris Bell
5         Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6         All rights reserved.
7 
8     License:
9         Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10         See LICENSE_TANGO.txt for details.
11 
12     Authors: Aaron, Kris
13 
14 *******************************************************************************/
15 
16 module ocean.text.xml.PullParser;
17 
18 import ocean.meta.types.Qualifiers;
19 
20 import ocean.text.Util : indexOf;
21 
22 import ocean.core.ExceptionDefinitions : XmlException;
23 
24 import Integer = ocean.text.convert.Integer_tango;
25 
26 import Utf = ocean.text.convert.Utf : toString;
27 
28 version (unittest)
29 {
30     import ocean.core.Test;
31 }
32 
33 /*******************************************************************************
34 
        Use -version=whitespace to retain whitespace as data nodes. We
        see a 25% increase in token count and a 10% throughput drop when
        parsing "hamlet.xml" with this option enabled (pullparser alone)
38 
39 *******************************************************************************/
40 
// Translate the user-facing -version=whitespace switch into the internal
// version identifiers that PullParser.next() tests.
version (whitespace)
    // retainwhite: next() only skips whitespace while inside a start tag
    // (i.e. after a StartElement or Attribute token).
    version = retainwhite;
else
{
   // stripwhite: next() skips characters <= 32 before every token.
   version = stripwhite;
   // partialwhite: once a Data token has been found, its start is moved
   // back over the whitespace that stripwhite skipped.
   version = partialwhite;
}
48 
49 /*******************************************************************************
50 
51         The XML node types
52 
53 *******************************************************************************/
54 
// Node kinds of an XML document. The values are spelled out explicitly and
// equal the ones the compiler assigned implicitly (0 .. 7 in declaration
// order). NOTE(review): this enum is not referenced by the pull parser in
// this module; presumably it is consumed by the document model -- confirm.
public enum XmlNodeType
{
    Element   = 0,
    Data      = 1,
    Attribute = 2,
    CData     = 3,
    Comment   = 4,
    PI        = 5,
    Doctype   = 6,
    Document  = 7
}
66 
67 /*******************************************************************************
68 
69         Values returned by the pull-parser
70 
71 *******************************************************************************/
72 
// Token kinds produced by PullParser.next(). The numeric order matters:
// next() uses `type < XmlTokenType.EndElement` to detect that the parser is
// still inside a start tag, and Done must stay 0 so that the result of
// next() can be used as a boolean "more tokens available" flag.
public enum XmlTokenType
{
    Done            = 0,    // end of input reached
    StartElement    = 1,    // "<name" (the closing '>' is consumed later)
    Attribute       = 2,    // name="value" inside a start tag
    EndElement      = 3,    // "</name>"
    EndEmptyElement = 4,    // "/>" closing an empty element
    Data            = 5,    // text between tags
    Comment         = 6,    // "<!-- ... -->"
    CData           = 7,    // "<![CDATA[ ... ]]>"
    Doctype         = 8,    // "<!DOCTYPE ... >"
    PI              = 9,    // "<? ... ?>"
    None            = 10    // state before the first token is read
}
87 
88 /*******************************************************************************
89 
90         Token based xml Parser.  Templated to operate with char[], wchar[],
91         and dchar[] content.
92 
93         The parser is constructed with some tradeoffs relating to document
94         integrity. It is generally optimized for well-formed documents, and
95         currently may read past a document-end for those that are not well
96         formed. There are various compilation options to enable checks and
97         balances, depending on how things should be handled. We'll settle
98         on a common configuration over the next few weeks, but for now all
99         settings are somewhat experimental. Partly because making some tiny
100         unrelated change to the code can cause notable throughput changes,
101         and we need to track that down.
102 
103         We're not yet clear why these swings are so pronounced (for changes
104         outside the code path) but they seem to be related to the alignment
105         of codegen. It could be a cache-line issue, or something else. We'll
106         figure it out, yet it's interesting that some hardware buttons are
107         clearly being pushed
108 
109 *******************************************************************************/
110 
class PullParser(ChMut = char)
{
    /// Character type of the parsed content; the parser only reads it.
    alias const(ChMut)  Ch;

    /// Element nesting depth: incremented when the '>' closing a start tag
    /// is consumed, decremented on EndElement, zeroed by reset.
    public int           depth;
    /// Part of the current name before ':' (null when there is none).
    public Ch[]          prefix;
    /// Slice of the input holding the value of the current token.
    public Ch[]          rawValue;
    /// Part of the current name after the optional "prefix:".
    public Ch[]          localName;
    /// Type of the most recently produced token (None before the first).
    public XmlTokenType  type = XmlTokenType.None;

    /// Input buffer plus the read cursor.
    package XmlText!(Ch) text;
    /// When true, hitting the end of input inside an element is not an error.
    private bool         stream;
    /// Message of the last error raised through error(msg).
    private istring      errMsg;

    /***********************************************************************

      Construct a parser on the given content (may be null)

     ***********************************************************************/

    this(Ch[] content = null)
    {
        reset (content);
    }

    /***********************************************************************

      Consume the next token and return its type

      On success the token type is also stored in `type` and, depending
      on the token, `prefix`, `localName` and `rawValue` are set to
      slices of the input.

      Returns:
        the type of the consumed token, or XmlTokenType.Done when the
        end of the input has been reached

      Throws:
        XmlException on malformed input, and on end of input while
        `depth` is non-zero unless incremental mode is enabled

     ***********************************************************************/

    final XmlTokenType next()
    {
        auto e = text.end;
        auto p = text.point;

        // at end of document?
        if (p >= e)
            return endOfInput;
        version (stripwhite)
        {
            // strip leading whitespace
            while (*p <= 32)
                if (++p >= e)
                    return endOfInput;
        }
        // StartElement or Attribute? Then we are still inside a start tag
        if (type < XmlTokenType.EndElement)
        {
            version (retainwhite)
            {
                // strip leading whitespace (thanks to DRK)
                while (*p <= 32)
                    if (++p >= e)
                        return endOfInput;
            }
            switch (*p)
            {
                case '>':
                    // termination of StartElement
                    ++depth;
                    ++p;
                    break;

                case '/':
                    // empty element closure
                    text.point = p;
                    return doEndEmptyElement;

                default:
                    // must be attribute instead
                    text.point = p;
                    return doAttributeName;
            }

            // the '>' may have been the very last character of the input
            if (p >= e)
                return endOfInput;
        }

        // consume data between elements?
        if (*p != '<')
        {
            auto q = p;
            while (++p < e && *p != '<') {}

            if (p < e)
            {
                version (partialwhite)
                {
                    // include leading whitespace, without ever stepping
                    // in front of the first character of the buffer
                    auto start = text.text.ptr;
                    while (q > start && *(q-1) <= 32)
                        --q;
                }
                text.point = p;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Data;
            }
            return endOfInput;
        }

        // must be a '<' character, so peek ahead (if there is anything
        // left to peek at)
        if (e - p < 2)
            return endOfInput;

        switch (p[1])
        {
            case '!':
                // one of the following ... (each probe is guarded so that
                // it never reads beyond the end of the input)
                if (e - p >= 4 && p[2..4] == "--")
                {
                    text.point = p + 4;
                    return doComment;
                }
                else
                    if (e - p >= 9 && p[2..9] == "[CDATA[")
                    {
                        text.point = p + 9;
                        return doCData;
                    }
                    else
                        if (e - p >= 9 && p[2..9] == "DOCTYPE")
                        {
                            text.point = p + 9;
                            return doDoctype;
                        }
                return doUnexpected("!", p);

            case '\?':
                // must be PI data
                text.point = p + 2;
                return doPI;

            case '/':
                // should be a closing element name
                p += 2;
                auto q = p;
                while (q < e && (*q > 63 || text.name[*q]))
                    ++q;

                // ran out of input inside the name?
                if (q >= e)
                    return endOfInput;

                if (*q is ':')
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (q < e && (*q > 63 || text.attributeName[*q]))
                        ++q;

                    localName = p[0 .. q - p];

                    if (q >= e)
                        return endOfInput;
                }
                else
                {
                    prefix = null;
                    localName = p[0 .. q - p];
                }

                // skip whitespace between the name and '>'
                while (*q <= 32)
                    if (++q >= e)
                        return endOfInput;

                if (*q is '>')
                {
                    --depth;
                    text.point = q + 1;
                    return type = XmlTokenType.EndElement;
                }
                return doExpected(">", q);

            default:
                // scan new element name
                auto q = ++p;
                while (q < e && (*q > 63 || text.name[*q]))
                    ++q;

                // check if we ran past the end
                if (q >= e)
                    return endOfInput;

                if (*q != ':')
                {
                    prefix = null;
                    localName = p [0 .. q - p];
                }
                else
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (q < e && (*q > 63 || text.attributeName[*q]))
                        ++q;
                    localName = p[0 .. q - p];
                }

                text.point = q;
                return type = XmlTokenType.StartElement;
        }
    }

    /***********************************************************************

      Parse an attribute (name, '=', quoted value) at the read cursor.

      Sets `prefix`, `localName` and `rawValue` (the text between the
      quotes, without the quotes). All scans are bounded by the end of
      the input; a truncated attribute yields endOfInput.

     ***********************************************************************/

    private XmlTokenType doAttributeName()
    {
        auto p = text.point;
        auto q = p;
        auto e = text.end;

        while (q < e && (*q > 63 || text.attributeName[*q]))
            ++q;
        if (q >= e)
            return endOfInput;

        if (*q is ':')
        {
            prefix = p[0 .. q - p];
            p = ++q;

            while (q < e && (*q > 63 || text.attributeName[*q]))
                ++q;
            if (q >= e)
                return endOfInput;

            localName = p[0 .. q - p];
        }
        else
        {
            prefix = null;
            localName = p[0 .. q - p];
        }

        // optional whitespace before '='
        if (*q <= 32)
        {
            while (++q < e && *q <= 32) {}
            if (q >= e)
                return endOfInput;
        }

        if (*q is '=')
        {
            // optional whitespace after '='
            while (++q < e && *q <= 32) {}
            if (q >= e)
                return endOfInput;

            auto quote = *q;
            switch (quote)
            {
                case '"':
                case '\'':
                    p = q + 1;
                    // find the matching end quote
                    while (++q < e && *q != quote) {}
                    if (q < e)
                    {
                        rawValue = p[0 .. q - p];
                        text.point = q + 1;   // skip end quote
                        return type = XmlTokenType.Attribute;
                    }
                    return endOfInput;

                default:
                    return doExpected("\' or \"", q);
            }
        }

        return doExpected ("=", q);
    }

    /***********************************************************************

      Consume the "/>" that closes an empty element. Called by next()
      with the cursor on a '/' character.

     ***********************************************************************/

    private XmlTokenType doEndEmptyElement()
    {
        auto p = text.point;

        // a lone '/' at the very end of the input is a truncated document
        if (text.end - p < 2)
            return endOfInput;

        if (p[0] is '/' && p[1] is '>')
        {
            localName = prefix = null;
            text.point += 2;
            return type = XmlTokenType.EndEmptyElement;
        }
        return doExpected("/>", text.point);
    }

    /***********************************************************************

      Consume a comment; the cursor is just behind "<!--". `rawValue`
      becomes the text up to (not including) the terminating "-->".

     ***********************************************************************/

    private XmlTokenType doComment()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '-')
                if (++p >= e)
                    return endOfInput;

            // not enough input left to hold the terminator
            if (e - p < 3)
                return endOfInput;

            if (p[0..3] == "-->")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Comment;
            }
            ++p;
        }

        return endOfInput;
    }

    /***********************************************************************

      Consume a CDATA section; the cursor is just behind "<![CDATA[".
      `rawValue` becomes the complete payload up to (not including) the
      terminating "]]>", including any ']' characters inside it.

     ***********************************************************************/

    private XmlTokenType doCData()
    {
        auto e = text.end;
        auto p = text.point;
        // start of the payload; must not move while we look for "]]>"
        auto q = p;

        while (p < e)
        {
            while (*p != ']')
                if (++p >= e)
                    return endOfInput;

            // not enough input left to hold the terminator
            if (e - p < 3)
                return endOfInput;

            if (p[0..3] == "]]>")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.CData;
            }
            ++p;
        }

        return endOfInput;
    }

    /***********************************************************************

      Consume a processing instruction; the cursor is just behind "<?".
      `rawValue` becomes the text up to (not including) the closing "?>".

     ***********************************************************************/

    private XmlTokenType doPI()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '\?')
                if (++p >= e)
                    return endOfInput;

            // a '?' as the last character cannot be followed by '>'
            if (e - p < 2)
                return endOfInput;

            if (p[1] == '>')
            {
                rawValue = q [0 .. p - q];
                text.point = p + 2;
                return type = XmlTokenType.PI;
            }
            ++p;
        }
        return endOfInput;
    }

    /***********************************************************************

      Consume a doctype declaration; the cursor is just behind
      "<!DOCTYPE". `rawValue` becomes the text between the leading
      whitespace and the closing '>'. A '[' ... ']' section is skipped
      as a whole so that a '>' inside it does not end the declaration.

     ***********************************************************************/

    private XmlTokenType doDoctype()
    {
        auto e = text.end;
        auto p = text.point;

        if (p >= e)
            return endOfInput;

        // strip leading whitespace
        while (*p <= 32)
            if (++p >= e)
                return endOfInput;

        auto q = p;
        while (p < e)
        {
            if (*p is '>')
            {
                rawValue = q [0 .. p - q];
                prefix = null;
                text.point = p + 1;
                return type = XmlTokenType.Doctype;
            }
            else
            {
                if (*p == '[')
                    do {
                        if (++p >= e)
                            return endOfInput;
                    } while (*p != ']');
                ++p;
            }
        }

        // the loop only ends when the input is exhausted
        return endOfInput;
    }

    /***********************************************************************

      Handle running out of input: raises "Unexpected EOF" when inside
      an element (depth != 0) and not in incremental mode, otherwise
      reports XmlTokenType.Done. `type` is left untouched.

     ***********************************************************************/

    private XmlTokenType endOfInput ()
    {
        if (depth && (stream is false))
            error ("Unexpected EOF");

        return XmlTokenType.Done;
    }

    /***********************************************************************

      Raise a parse error for unexpected content `msg` found at `p`

     ***********************************************************************/

    private XmlTokenType doUnexpected (istring msg, Ch* p)
    {
        return position ("parse error :: unexpected  " ~ msg, p);
    }

    /***********************************************************************

      Raise a parse error stating that `msg` was expected at `p`; the
      character actually found there is included in the message. `p`
      must point inside the input.

     ***********************************************************************/

    private XmlTokenType doExpected (istring msg, Ch* p)
    {
        char[6] tmp = void;
        return position ("parse error :: expected  " ~ msg ~ " instead of "
                ~ idup(Utf.toString(p[0..1], tmp)), p);
    }

    /***********************************************************************

      Raise an error whose message is `msg` followed by the offset of
      `p` from the start of the input

     ***********************************************************************/

    private XmlTokenType position (istring msg, Ch* p)
    {
        return error (msg ~ " at position "
                ~ idup(Integer.toString(p-text.text.ptr)));
    }

    /***********************************************************************

      Record `msg` as the last error and throw it as an XmlException.
      Never returns normally; the return type only allows
      `return error(...)` in the token functions.

     ***********************************************************************/

    protected final XmlTokenType error (istring msg)
    {
        errMsg = msg;
        throw new XmlException (msg);
    }

    /***********************************************************************

      Return the raw value of the current token

     ***********************************************************************/

    final Ch[] value()
    {
        return rawValue;
    }

    /***********************************************************************

      Return the name of the current token

      When a prefix is present the result is "prefix:localName", built
      by concatenation (which allocates); otherwise `localName` itself
      is returned.

     ***********************************************************************/

    final Ch[] name()
    {
        if (prefix.length)
            return prefix ~ ":" ~ localName;
        return localName;
    }

    /***********************************************************************

      Returns the text of the last error

     ***********************************************************************/

    final istring error()
    {
        return errMsg;
    }

    /***********************************************************************

      Reset the parser to the start of the current content

      Returns:
        always true

     ***********************************************************************/

    final bool reset()
    {
        text.reset (text.text);
        reset_;
        return true;
    }

    /***********************************************************************

      Reset parser with new content

     ***********************************************************************/

    final void reset(Ch[] newText)
    {
        text.reset (newText);
        reset_;
    }

    /***********************************************************************

        experimental: set streaming mode

        Use at your own risk, may be removed.

        In this mode reaching the end of the input inside an element
        returns XmlTokenType.Done instead of raising "Unexpected EOF".

     ***********************************************************************/

    final void incremental (bool yes = true)
    {
        stream = yes;
    }

    /***********************************************************************

      Clear the parser state and position the cursor behind an optional
      UTF-8 byte order mark, leading whitespace and a leading "<?xml"
      declaration. Every probe is bounded by the end of the input, and
      the cursor is never left beyond it.

     ***********************************************************************/

    private void reset_()
    {
        depth = 0;
        errMsg = null;
        type = XmlTokenType.None;

        auto p = text.point;
        if (p)
        {
            auto e = text.end;

            static if (Ch.sizeof == 1)
            {
                // consume UTF8 BOM
                if (e - p >= 3
                    && p[0] is 0xef && p[1] is 0xbb && p[2] is 0xbf)
                    p += 3;
            }

            //TODO enable optional declaration parsing
            while (p < e && *p <= 32)
                ++p;

            if (e - p >= 5)
                if (p[0] is '<' && p[1] is '\?' && p[2..5] == "xml")
                {
                    p += 5;
                    while (p < e && *p != '\?')
                        ++p;

                    // skip the closing "?>", clamped to the end of input
                    // for an unterminated declaration
                    p = (e - p >= 2) ? p + 2 : e;
                }
            text.point = p;
        }
    }
}
667 
668 
669 /*******************************************************************************
670 
671  *******************************************************************************/
672 
package struct XmlText(Ch)
{
    package Ch*     end;        // one past the last character of `text`
    package size_t  len;        // number of characters in `text`
    package Ch[]    text;       // the complete input
    package Ch*     point;      // current read position

    // Install `newText` as the input and rewind the read position to its
    // first character.
    final void reset(Ch[] newText)
    {
        auto first = newText.ptr;
        auto count = newText.length;

        text  = newText;
        len   = count;
        point = first;
        end   = first + count;
    }

    // Lookup table for code points 0 .. 63: non-zero means the character
    // continues an element name. The zero entries (terminators) are
    // NUL, TAB, LF, CR, space, '/', ':', '>' and '?'.
    static const(ubyte[64]) name =
    [
        // 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        0,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  1,  0,  1,  1,  // 0
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 1
        0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  // 2
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  0,  0   // 3
    ];

    // Same layout as `name`, used for attribute names and for the local
    // part behind a "prefix:". The zero entries (terminators) are
    // NUL, TAB, LF, CR, space, '!', '/', ':', '<', '=', '>' and '?'.
    static const(ubyte[64]) attributeName =
    [
        // 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        0,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  1,  0,  1,  1,  // 0
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 1
        0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  // 2
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  0,  0,  0,  0   // 3
    ];
}
706 
707 /*******************************************************************************
708 
709  *******************************************************************************/
710 
version (unittest)
{
    /***********************************************************************

        Walks `itr` through the complete token sequence of `testXML` and
        checks type, names, values and depth along the way.

        `test(itr.next)` relies on every token type except
        XmlTokenType.Done being non-zero; the final `test(!itr.next)`
        checks that Done is reported once the input is exhausted.

     ***********************************************************************/

    void testParser(Ch)(PullParser!(Ch) itr)
    {
        // <!DOCTYPE ...> (the leading <?xml ...?> is skipped on reset)
        test(itr.next);
        test(itr.value == "element [ <!ELEMENT element (#PCDATA)>]");
        test(itr.type == XmlTokenType.Doctype);
        // <element ... : depth only grows once the closing '>' is consumed
        test(itr.next);
        test(itr.localName == "element");
        test(itr.type == XmlTokenType.StartElement);
        test(itr.depth == 0);
        // attr="1"
        test(itr.next);
        test(itr.localName == "attr");
        test(itr.value == "1");
        // attr2="two"
        test(itr.next);
        test(itr.type == XmlTokenType.Attribute);
        test(itr.localName == "attr2");
        test(itr.value == "two");
        // <!--comment-->
        test(itr.next);
        test(itr.value == "comment");
        // character data is returned raw, entities are not decoded
        test(itr.next);
        test(itr.rawValue == "test&amp;&#x5a;");
        // <qual:elem />
        test(itr.next);
        test(itr.prefix == "qual");
        test(itr.localName == "elem");
        test(itr.next);
        test(itr.type == XmlTokenType.EndEmptyElement);
        // <el2 attr3 = '3three'> (whitespace around '=', single quotes)
        test(itr.next);
        test(itr.localName == "el2");
        test(itr.depth == 1);
        test(itr.next);
        test(itr.localName == "attr3");
        test(itr.value == "3three", itr.value);
        // <![CDATA[sdlgjsh]]>
        test(itr.next);
        test(itr.rawValue == "sdlgjsh");
        // <el3 />
        test(itr.next);
        test(itr.localName == "el3");
        test(itr.depth == 2);
        test(itr.next);
        test(itr.type == XmlTokenType.EndEmptyElement);
        // data
        test(itr.next);
        test(itr.value == "data");
        // <?pi test?>
        test(itr.next);
        test(itr.rawValue == "pi test", itr.rawValue);
        // </el2>
        test(itr.next);
        test(itr.localName == "el2");
        // </element>
        test(itr.next);
        test(itr.localName == "element");
        // end of input
        test(!itr.next);
    }


    /***********************************************************************

        Document used by the unit test: XML declaration, doctype with an
        internal subset, attributes in both quote styles, a comment,
        entity references, a prefixed empty element, a CDATA section and
        a processing instruction.

     ***********************************************************************/

    static immutable istring testXML = "<?xml version=\"1.0\" ?><!DOCTYPE element [ <!ELEMENT element (#PCDATA)>]><element "
        ~ "attr=\"1\" attr2=\"two\"><!--comment-->test&amp;&#x5a;<qual:elem /><el2 attr3 = "
        ~ "'3three'><![CDATA[sdlgjsh]]><el3 />data<?pi test?></el2></element>";
}
775 
unittest
{
    // First pass: parse the sample document with a freshly built parser
    auto parser = new PullParser!(char)(testXML);
    testParser(parser);

    // Second pass: re-using the parser on new text (or even the same one)
    // must not trigger any further memory allocation
    testNoAlloc({
        parser.reset(testXML);
        testParser(parser);
    }());
}