1 /*******************************************************************************
2 
3         Copyright:
4             Copyright (C) 2008 Aaron Craelius & Kris Bell.
5             Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6             All rights reserved.
7 
8         License:
9             Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10             See LICENSE_TANGO.txt for details.
11 
12         Version: Initial release: July 2008
13 
14         Authors: Aaron, Kris
15 
16 *******************************************************************************/
17 
18 module ocean.text.json.JsonParser;
19 
20 import ocean.meta.types.Qualifiers;
21 import ocean.core.Exception;
22 import ocean.util.container.more.Stack;
23 
24 version (unittest) import ocean.core.Test;
25 
26 /*******************************************************************************
27 
28  If AllowNaN is true, then NaN, Infinity, and -Infinity are parsed.
29 
30  NaN, Infinity, and -Infinity are technically not part of
31  the JSON specification, but JavaScript writes them by default, so they are
32  by far the most common cause of invalid JSON. Practically all
33  JSON parsers (e.g. Google GSON, Jackson, Ruby's JSON, simplejson,
34  JSON.net, Lua CJson) have an option to accept NaN.
35 
36 *******************************************************************************/
37 
class JsonParser(T, bool AllowNaN = false)
{
    /***************************************************************************

        JSON tokens. `NaN`, `Infinity` and `NegInfinity` are only produced
        when `AllowNaN` is true.

    ***************************************************************************/

    public enum Token
    {
        Empty, Name, String, Number, BeginObject, EndObject,
        BeginArray, EndArray, True, False, Null,
        NaN, Infinity, NegInfinity
    }

    /// Kind of container the parser is currently positioned in
    private enum State
    {
        Object,
        Array
    }

    /// Cursor over the text being parsed
    private struct Iterator
    {
        /// Current read position
        const(T)*   ptr;
        /// One past the last code unit of `text`
        const(T)*   end;
        /// The complete input, kept to compute offsets for error messages
        const(T)[]  text;

        /// Points the cursor at the beginning of `text`
        void reset (const(T)[] text)
        {
            this.text = text;
            this.ptr = text.ptr;
            this.end = this.ptr + text.length;
        }
    }

    /// Read cursor over the input
    protected Iterator              str;
    /// States of the enclosing containers
    /// NOTE(review): the `16` presumably caps the nesting depth; confirm
    /// against ocean's `Stack`
    private Stack!(State, 16)       state;
    /// Start of the current token inside the input
    private const(T)*               curLoc;
    /// Length of the current token, in code units
    private ptrdiff_t               curLen;
    /// Kind of container currently being parsed
    private State                   curState;
    /// Type of the current token
    protected Token                 curType;
    /// Exception instance used for every error raised by this parser
    protected JsonParserException   exception;

    /***************************************************************************

        Construct a parser from a string

        Params:
            text = Text to initialize this parser to. Can be `null`.

    ***************************************************************************/

    this (const(T)[] text = null)
    {
        this.exception = new JsonParserException();
        this.reset(text);
    }


    /***************************************************************************

        Advances to the next token.

        Returns:
            `true` if a token was parsed, `false` if no input is set, only
            characters <= 32 remain, or a value starts with an unrecognised
            character

        Throws:
            JsonParserException on malformed or truncated input

    ***************************************************************************/

    final bool next ()
    {
        if (this.str.ptr is null || this.str.end is null)
            return false;

        this.str.ptr = this.skipSpace(this.str.ptr);

        if (this.str.ptr >= this.str.end)
            return false;

        // From here on `*this.str.ptr` is guaranteed to be readable
        if (this.curState is State.Array)
            return this.parseArrayValue();

        // Inside an object a name is always followed by its value
        if (this.curType is Token.Name)
            return this.parseMemberValue();

        return this.parseMemberName();
    }

    /// Returns: The `Token` type of the current token
    final Token type ()
    {
        return this.curType;
    }

    /// Returns: The text of the current token, as a slice of the input
    final const(T)[] value ()
    {
        return this.curLoc[0 .. this.curLen];
    }

    /***************************************************************************

        Reset the parser to a new string

        Params:
            json = new string to process

        Returns:
            `true` if the document starts with a '{' or a '[', `false` if it
            is empty or only consists of characters <= 32

        Throws:
            JsonParserException if the document starts with anything else

    ***************************************************************************/

    bool reset (const(T)[] json = null)
    {
        this.state.clear();
        this.str.reset(json);
        this.curType = Token.Empty;
        this.curState = State.Object;

        if (json.length)
        {
            auto p = this.skipSpace(this.str.ptr);

            if (p < this.str.end)
                return this.start(*(this.str.ptr = p));
        }
        return false;
    }


    /// Throws: `this.exception`, with "expected `token`" as message
    protected final void expected (cstring token)
    {
        throw this.exception.set("expected ").append(token);
    }

    /***************************************************************************

        Report error about an expected token not being found

        Params:
            token = the token that was expected to be found
            point = Where the token was expected

        Throws:
            Always ends up throwing `this.exception`, with the offset of
            `point` from the start of the input appended to the message

    ***************************************************************************/

    protected final void expected (cstring token, const(T)* point)
    {
        auto diff = cast(int) (point - this.str.text.ptr);
        throw this.exception.set("expected ").append(token).append(" @input[")
            .append(diff).append("]");
    }

    /// Throws: `this.exception` with "unexpected end-of-input: msg" as message
    private void unexpectedEOF (cstring msg)
    {
        throw this.exception.set("unexpected end-of-input: ").append(msg);
    }

    /***************************************************************************

        Skips code units <= 32 without ever reading past the end of the input

        Params:
            p = position to start from, within `[str.ptr .. str.end]`

        Returns:
            the first position at or after `p` holding a code unit > 32, or
            `str.end` if there is none

    ***************************************************************************/

    private const(T)* skipSpace (const(T)* p)
    {
        auto e = this.str.end;

        // The bounds check has to come first: `*p` is not readable at `e`
        while (p < e && *p <= 32)
            ++p;

        return p;
    }

    /***************************************************************************

        Skips ASCII digits without ever reading past the end of the input

        Params:
            p = position to start from, within `[str.ptr .. str.end]`

        Returns:
            the first position at or after `p` that does not hold a digit,
            or `str.end` if there is none

    ***************************************************************************/

    private const(T)* skipDigits (const(T)* p)
    {
        auto e = this.str.end;

        while (p < e && *p >= '0' && *p <= '9')
            ++p;

        return p;
    }


    /// Called by `reset`, ensure the document starts with '{' or '['
    private bool start (T c)
    {
        if (c is '{')
            return this.push(Token.BeginObject, State.Object);

        if (c is '[')
            return this.push(Token.BeginArray, State.Array);

        this.expected("'{' or '[' at start of document");

        assert(0);
    }

    /***************************************************************************

        Parses either the end of the current object or the next member name,
        optionally preceded by a ','.

        Expects `str.ptr` to point at a readable code unit.

        Returns:
            `true`

        Throws:
            JsonParserException if no '"' is found where the name should
            start, or if the input ends before the name is complete

    ***************************************************************************/

    private bool parseMemberName ()
    {
        auto p = this.str.ptr;

        if (*p is '}')
            return this.pop(Token.EndObject);

        if (*p is ',')
            ++p;

        p = this.skipSpace(p);

        // e.g. `{"a": 1,` - nothing is left to read after the separator
        if (p >= this.str.end)
            this.unexpectedEOF("before attribute-name");

        if (*p != '"')
        {
            if (*p == '}')
                this.expected("an attribute-name after (a potentially trailing) ','", p);
            else
                this.expected("'\"' before attribute-name", p);
        }

        return this.scanQuoted(p, Token.Name, "in attribute-name");
    }

    /***************************************************************************

        Parses the ':' separator and the member value following it.

        Expects `str.ptr` to point at a readable code unit.

        Returns:
            the result of `parseValue`

        Throws:
            JsonParserException if the ':' is missing or if the input ends
            before the value starts

    ***************************************************************************/

    private bool parseMemberValue ()
    {
        auto p = this.str.ptr;

        if (*p != ':')
            this.expected("':' before attribute-value", p);

        this.str.ptr = p = this.skipSpace(p + 1);

        // e.g. `{"a":` - there is no value to look at
        if (p >= this.str.end)
            this.unexpectedEOF("before attribute-value");

        return this.parseValue(*p);
    }

    /***************************************************************************

        Parses the value starting at `str.ptr`

        Params:
            c = the code unit at `str.ptr`, used to select the kind of value

        Returns:
            `true` if a value was parsed, `false` if `c` does not start any
            known kind of value (see `parseNumber`)

        Throws:
            JsonParserException on a misspelled literal or truncated input

    ***************************************************************************/

    private bool parseValue (T c)
    {
        switch (c)
        {
        case '{':
            return this.push(Token.BeginObject, State.Object);

        case '[':
            return this.push(Token.BeginArray, State.Array);

        case '"':
            return this.doString();

        case 'n':
            if (this.match("null", Token.Null))
                return true;
            this.expected("'null'", this.str.ptr);
            assert(false);

        case 't':
            if (this.match("true", Token.True))
                return true;
            this.expected("'true'", this.str.ptr);
            assert(false);

        case 'f':
            if (this.match("false", Token.False))
                return true;
            this.expected("'false'", this.str.ptr);
            assert(false);

        static if (AllowNaN)
        {
        case 'N':
            if (this.match("NaN", Token.NaN))
                return true;
            this.expected ("'NaN'", this.str.ptr);
            assert(false);

        case 'I':
            if (this.match("Infinity", Token.Infinity))
                return true;
            this.expected ("'Infinity'", this.str.ptr);
            assert(false);

        case '-':
            if (this.match("-Infinity", Token.NegInfinity))
                return true;
            // Not "-Infinity": fall back to parsing a negative number
            break;
        }

        default:
            break;
        }

        return this.parseNumber();
    }

    /// Parses the string value whose opening '"' is at `str.ptr`
    private bool doString ()
    {
        return this.scanQuoted(this.str.ptr, Token.String, "in string");
    }

    /***************************************************************************

        Scans a quoted name or string up to its closing, unescaped '"'.

        The token text excludes both quotes and is not unescaped. On success
        `str.ptr` is left just after the closing quote.

        Params:
            open    = position of the opening '"'
            type    = token type to report
            context = text appended to the error message if the input ends
                      before the closing quote

        Returns:
            `true`

        Throws:
            JsonParserException if no closing quote is found

    ***************************************************************************/

    private bool scanQuoted (const(T)* open, Token type, cstring context)
    {
        auto p = open;
        auto e = this.str.end;

        this.curLoc = open + 1;
        this.curType = type;

        while (++p < e)
            if (*p is '"' && !this.escaped(p))
                break;

        if (p < e)
            this.curLen = p - this.curLoc;
        else
            this.unexpectedEOF(context);

        this.str.ptr = p + 1;
        return true;
    }

    /***************************************************************************

        Parses a number: optional sign, digits, optional fraction and
        optional exponent. Every part may be empty, no further validation
        is done.

        Expects `str.ptr` to point at a readable code unit.

        Returns:
            `true` if at least one code unit was consumed, `false` otherwise

        Throws:
            JsonParserException if the number extends to the very end of the
            input (a number is always followed by at least the terminator of
            the enclosing container)

    ***************************************************************************/

    private bool parseNumber ()
    {
        auto p = this.str.ptr;
        auto e = this.str.end;

        this.curLoc = p;
        this.curType = Token.Number;

        // Every read below is guarded by `p < e`
        if (p < e && (*p is '-' || *p is '+'))
            ++p;

        p = this.skipDigits(p);

        if (p < e && *p is '.')
            p = this.skipDigits(p + 1);

        if (p < e && (*p is 'e' || *p is 'E'))
        {
            ++p;

            if (p < e && (*p is '-' || *p is '+'))
                ++p;

            p = this.skipDigits(p);
        }

        if (p < e)
            this.curLen = p - this.curLoc;
        else
            this.unexpectedEOF("after number");

        this.str.ptr = p;
        return this.curLen > 0;
    }

    /***************************************************************************

        Checks whether the input at `str.ptr` starts with `name` and, if so,
        makes it the current token and moves past it.

        Params:
            name  = literal to look for
            token = token type to report on success

        Returns:
            `true` on a match, `false` otherwise (including when fewer than
            `name.length` code units are left)

    ***************************************************************************/

    private bool match (const(T)[] name, Token token)
    {
        auto i = name.length;
        auto remaining = cast(size_t) (this.str.end - this.str.ptr);

        // Check the length first so that the slice cannot exceed the input
        if (remaining >= i && this.str.ptr[0 .. i] == name)
        {
            this.curLoc = this.str.ptr;
            this.curType = token;
            this.str.ptr += i;
            this.curLen = i;
            return true;
        }
        return false;
    }

    /// Enters a container: consumes one code unit, saves the current state
    /// on the stack and makes `next` the current one
    private bool push (Token token, State next)
    {
        this.curLen = 0;
        this.curType = token;
        this.curLoc = this.str.ptr++;
        this.state.push(this.curState);
        this.curState = next;
        return true;
    }

    /// Leaves a container: consumes one code unit and restores the state
    /// saved by the matching `push`
    private bool pop (Token token)
    {
        this.curLen = 0;
        this.curType = token;
        this.curLoc = this.str.ptr++;
        this.curState = this.state.pop;
        return true;
    }

    /***************************************************************************

        Parses either the end of the current array or its next value,
        optionally preceded by a ','.

        Expects `str.ptr` to point at a readable code unit.

        Returns:
            the result of `parseValue`

        Throws:
            JsonParserException on truncated or malformed input

    ***************************************************************************/

    private bool parseArrayValue ()
    {
        auto p = this.str.ptr;

        if (*p is ']')
            return this.pop(Token.EndArray);

        if (*p is ',')
            ++p;

        this.str.ptr = p = this.skipSpace(p);

        // e.g. `[1,` - there is no value to look at
        if (p >= this.str.end)
            this.unexpectedEOF("in array");

        return this.parseValue(*p);
    }

    /***************************************************************************

        Tells whether the code unit at `p` is escaped, i.e. preceded by an
        odd number of backslashes.

        Only called for positions after an opening '"' (see `scanQuoted`),
        so the backwards scan stops at that quote at the latest.

        Params:
            p = position to check

        Returns:
            1 if escaped, 0 otherwise

    ***************************************************************************/

    private int escaped (const(T)* p)
    {
        int i;

        while (*--p is '\\')
            ++i;
        return i & 1;
    }
}
454 
/// Exception thrown by `JsonParser` on invalid input. Each parser allocates
/// one instance in its constructor and re-populates it for every error.
public class JsonParserException : Exception
{
    // Brings in the `set` / `append` message builders used by the parser
    mixin ReusableExceptionImplementation!() R;
}
459 
460 
// Walks a complete document token by token. Note that the members following
// "ANumber" are deliberately not separated by commas.
unittest
{
    static immutable istring json =
    `{
        "glossary": {
            "title": "example glossary",
            "GlossDiv": {
                "title": "S",
                "GlossList": {
                    "GlossEntry": {
                        "ID": "SGML",
                        "SortAs": "SGML",
                        "GlossTerm": "Standard Generalized Markup Language",
                        "Acronym": "SGML",
                        "Abbrev": "ISO 8879:1986",
                        "GlossDef": {
                            "para": "A meta-markup language, used to create markup languages such as DocBook.",
                            "GlossSeeAlso": [
                                "GML",
                                "XML"
                            ]
                        },
                        "GlossSee": "markup",
                        "ANumber": 12345.6e7
                        "BNumber": 12345.6e+7
                        "CNumber": 12345.6e-7
                        "DNumber": 12345.6E7
                        "ENumber": 12345.6E+7
                        "FNumber": 12345.6E-7
                        "True": true
                        "False": false
                        "Null": null
                    }
                }
            }
        }
     }`;

    alias Parser = JsonParser!(char);
    alias Tok = Parser.Token;

    // One expected token; `text` is compared with `value()` only if
    // `checkText` is set
    static struct Expect
    {
        Tok     type;
        istring text;
        bool    checkText;
    }

    // Expect a token of the given type, whatever its text
    static Expect tok (Tok type)
    {
        return Expect(type, null, false);
    }

    // Expect a token of the given type with the given text
    static Expect txt (Tok type, istring text)
    {
        return Expect(type, text, true);
    }

    Expect[] expected = [
        txt(Tok.Name, "glossary"),
        txt(Tok.BeginObject, ""),
        txt(Tok.Name, "title"),
        txt(Tok.String, "example glossary"),
        txt(Tok.Name, "GlossDiv"),
        tok(Tok.BeginObject),
        txt(Tok.Name, "title"),
        txt(Tok.String, "S"),
        txt(Tok.Name, "GlossList"),
        tok(Tok.BeginObject),
        txt(Tok.Name, "GlossEntry"),
        tok(Tok.BeginObject),
        txt(Tok.Name, "ID"),
        txt(Tok.String, "SGML"),
        txt(Tok.Name, "SortAs"),
        txt(Tok.String, "SGML"),
        txt(Tok.Name, "GlossTerm"),
        txt(Tok.String, "Standard Generalized Markup Language"),
        txt(Tok.Name, "Acronym"),
        txt(Tok.String, "SGML"),
        txt(Tok.Name, "Abbrev"),
        txt(Tok.String, "ISO 8879:1986"),
        txt(Tok.Name, "GlossDef"),
        tok(Tok.BeginObject),
        txt(Tok.Name, "para"),
        txt(Tok.String, "A meta-markup language, used to create markup languages such as DocBook."),
        txt(Tok.Name, "GlossSeeAlso"),
        tok(Tok.BeginArray),
        txt(Tok.String, "GML"),
        txt(Tok.String, "XML"),
        tok(Tok.EndArray),
        tok(Tok.EndObject),
        txt(Tok.Name, "GlossSee"),
        txt(Tok.String, "markup"),
        txt(Tok.Name, "ANumber"),
        txt(Tok.Number, "12345.6e7"),
        txt(Tok.Name, "BNumber"),
        txt(Tok.Number, "12345.6e+7"),
        txt(Tok.Name, "CNumber"),
        txt(Tok.Number, "12345.6e-7"),
        txt(Tok.Name, "DNumber"),
        txt(Tok.Number, "12345.6E7"),
        txt(Tok.Name, "ENumber"),
        txt(Tok.Number, "12345.6E+7"),
        txt(Tok.Name, "FNumber"),
        txt(Tok.Number, "12345.6E-7"),
        txt(Tok.Name, "True"),
        tok(Tok.True),
        txt(Tok.Name, "False"),
        tok(Tok.False),
        txt(Tok.Name, "Null"),
        tok(Tok.Null),
        tok(Tok.EndObject),
        tok(Tok.EndObject),
        tok(Tok.EndObject),
        tok(Tok.EndObject),
        tok(Tok.EndObject),
    ];

    auto parser = new Parser(json);
    test(parser);

    // The constructor already consumed the opening '{'
    test(parser.type == Tok.BeginObject);

    foreach (step; expected)
    {
        test(parser.next);
        test(parser.type == step.type);

        if (step.checkText)
            test(parser.value == step.text, parser.value);
    }

    // The input is exhausted and every container has been closed
    test(!parser.next);
    test(parser.state.size == 0);
}