1 /*******************************************************************************
2 
3         Copyright:
4             Copyright (c) 2004 Kris Bell.
5             Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6             All rights reserved.
7 
8         License:
9             Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10             See LICENSE_TANGO.txt for details.
11 
12         Version: Initial release: December 2005
13 
14         Authors: Kris Bell
15 
16 *******************************************************************************/
17 
18 module ocean.io.stream.Iterator;
19 
20 import ocean.meta.types.Qualifiers;
21 
22 import ocean.core.Verify;
23 
24 import ocean.io.stream.Buffered;
25 
26 package import ocean.io.device.Conduit : InputFilter, InputBuffer, InputStream;
27 
28 /*******************************************************************************
29 
30         The base class for a set of stream iterators. These operate
31         upon a buffered input stream, and are designed to deal with
32         partial content. That is, stream iterators go to work the
33         moment any data becomes available in the buffer. Contrast
34         this behaviour with the ocean.text.Util iterators, which
35         operate upon the extent of an array.
36 
37         There are two types of iterators supported; exclusive and
38         inclusive. The former are the more common kind, where a token
39         is delimited by elements that are considered foreign. Examples
40         include space, comma, and end-of-line delineation. Inclusive
41         tokens are just the opposite: they look for patterns in the
42         text that should be part of the token itself - everything else
43         is considered foreign. Currently ocean.io.stream includes the
44         exclusive variety only.
45 
46         Each pattern is exposed to the client as a slice of the original
47         content, where the slice is transient. If you need to retain the
48         exposed content, then you should .dup it appropriately.
49 
50         The content provided to these iterators is intended to be fully
51         read-only. All current tokenizers abide by this rule, but it is
52         possible a user could mutate the content through a token slice.
53         To enforce the desired read-only aspect, the code would have to
54         introduce redundant copying or the compiler would have to support
55         read-only arrays (now in D2).
56 
57         See Delimiters, Lines, Patterns, Quotes.
58 
59 *******************************************************************************/
60 
class Iterator : InputFilter
{
        // Buffered view of the attached stream. Remains null until set() has
        // been called with a stream; also mirrored into super.source there.
        private InputBuffer     source;

        // Current token and, where a scanner records one via the four-argument
        // set(), the delimiter that terminated it.
        protected cstring       slice,
                                delim;

        /***********************************************************************

                The pattern scanner, implemented via subclasses. It is
                handed to the buffer as the scanning callback each time
                a token is consumed, and receives the content currently
                available for inspection.

                Implementations are presumably expected to return the
                result of found() or notFound() -- the helpers below
                exist for that purpose.

        ***********************************************************************/

        abstract protected size_t scan (const(void)[] data);

        /***********************************************************************

                Instantiate with an optional stream. When no stream is
                given, no scanning source is attached and set() must be
                called before any token is requested.

        ***********************************************************************/

        this (InputStream stream = null)
        {
                super (stream);
                if (stream)
                    set (stream);
        }

        /***********************************************************************

                Set the provided stream as the scanning source. The
                stream must not be null; it is wrapped in a buffer via
                BufferedInput.create(). Returns this instance so that
                calls may be chained.

        ***********************************************************************/

        Iterator set (InputStream stream)
        {
                verify(stream !is null);
                source = BufferedInput.create (stream);
                super.source = source;
                return this;
        }

        /***********************************************************************

                Return the current token as a slice of the content.

        ***********************************************************************/

        final cstring get ()
        {
                return slice;
        }

        /**********************************************************************

                Iterate over the set of tokens. Each token is passed to
                the delegate as a cstring slice of the buffered content.

                The delegate is always invoked at least once, and is also
                invoked with whatever content remains once the scanner no
                longer finds a match. Iteration stops when the delegate
                returns non-zero, or when no further tokens are expected.

        **********************************************************************/

        int opApply (scope int delegate(ref cstring) dg)
        {
                bool more;
                int  result;

                do {
                   more = consume;
                   result = dg (slice);
                   } while (more && !result);
                return result;
        }

        /**********************************************************************

                Iterate over a set of tokens, exposing a token count
                starting at zero. Termination rules are the same as for
                the single-argument variant.

        **********************************************************************/

        int opApply (scope int delegate(ref int, ref cstring) dg)
        {
                bool more;
                int  result,
                     tokens;

                do {
                   more = consume;
                   result = dg (tokens, slice);
                   ++tokens;
                   } while (more && !result);
                return result;
        }

        /**********************************************************************

                Iterate over a set of tokens and delimiters, exposing a
                token count starting at zero. The delimiter is cleared
                before each token is consumed, so it is null whenever the
                scanner did not record one for the current token.

        **********************************************************************/

        int opApply (scope int delegate(ref int, ref cstring, ref cstring) dg)
        {
                bool more;
                int  result,
                     tokens;

                do {
                   // forget the previous delimiter; the scanner only sets
                   // a new one when it matches via the four-argument set()
                   delim = null;
                   more = consume;
                   result = dg (tokens, slice, delim);
                   ++tokens;
                   } while (more && !result);
                return result;
        }

        /***********************************************************************

                Locate the next token. Returns the token if found, null
                otherwise. Null indicates an end of stream condition. To
                sweep a conduit for lines using method next():
                ---
                auto lines = new Lines!(char) (new File("myfile"));
                while (lines.next)
                       Cout (lines.get).newline;
                ---

                Alternatively, we can extract one line from a conduit:
                ---
                auto line = (new Lines!(char) (new File("myfile"))).next;
                ---

                The difference between next() and foreach() is that the
                latter processes all tokens in one go, whereas the former
                processes in a piecemeal fashion. To wit:
                ---
                foreach (line; new Lines!(char) (new File("myfile")))
                         Cout(line).newline;
                ---

        ***********************************************************************/

        final cstring next ()
        {
                // a non-empty trailing token is still returned even though
                // consume() reports that no further tokens are expected
                if (consume() || slice.length)
                    return slice;
                return null;
        }

        /***********************************************************************

                Set the content of the current slice to the provided
                start and end points (end is exclusive). Returns end.

        ***********************************************************************/

        protected final size_t set (const(char)* content, size_t start, size_t end)
        {
                slice = content [start .. end];
                return end;
        }

        /***********************************************************************

                Set the content of the current slice to the provided
                start and end points, and the delimiter to the segment
                between end and next, where next is the index of the
                last delimiter element (inclusive). Returns end.

        ***********************************************************************/

        protected final size_t set (const(char)* content, size_t start, size_t end, size_t next)
        {
                slice = content [start .. end];
                delim = content [end .. next+1];
                return end;
        }

        /***********************************************************************

                Called when a scanner fails to find a matching pattern.
                This may cause more content to be loaded, and a rescan
                initiated. Returns Eof.

        ***********************************************************************/

        protected final size_t notFound ()
        {
                return Eof;
        }

        /***********************************************************************

                Invoked when a scanner matches a pattern. The provided
                value should be the index of the last element of the
                matching pattern; the returned value is that index plus
                one, i.e. the count of elements up to and including the
                match.

        ***********************************************************************/

        protected final size_t found (size_t i)
        {
                return (i + 1);
        }

        /***********************************************************************

                See if set of characters holds a particular instance.
                Returns true when match occurs anywhere within set, and
                false otherwise (including for an empty set).

        ***********************************************************************/

        protected final bool has (cstring set, char match)
        {
                foreach (c; set)
                         if (match is c)
                             return true;
                return false;
        }

        /***********************************************************************

                Consume the next token and place it in 'slice'. Returns
                true when there are potentially more tokens.

                A scanning source must have been attached beforehand,
                either through the constructor or through set().

        ***********************************************************************/

        private bool consume ()
        {
                // the constructor's default argument leaves 'source' null;
                // report that as a verification failure instead of
                // dereferencing a null buffer below
                verify(source !is null);

                if (source.next (&scan))
                    return true;

                // consume trailing token: the scanner found no further match,
                // so hand out everything the buffer exposes to the reader
                source.reader ((const(void)[] arr)
                              {
                              slice = (cast(const(char)*) arr.ptr) [0 .. arr.length];
                              return arr.length;
                              });
                return false;
        }
}