1 /*******************************************************************************
2 
3     Class for parsing streams of CSV data.
4 
5     Currently the class is capable of parsing only fairly simple, well-formatted
6     CSV. The following basic format features are supported:
7 
8         * Newline (\n) separated lines.
9         * Comma (or arbitrary character) -separated fields.
10         * Quoted fields (a " character, followed by any number of characters,
11           and delimited by another " and a separator character). Separators
12           (commas) and newlines (\n) may both appear inside quoted fields.
13 
14     Usage:
15 
16     ---
17 
18         import ocean.io.Stdout;
19         import ocean.io.device.File;
20 
21         scope file = new File("example.csv", File.ReadExisting);
22         scope csv = new CSV;
23 
24         csv.parse(file,
        (cstring[] fields)
26         {
27             Stdout.formatln("Row={}", fields);
28             return true; // tells CSV instance to continue parsing
29         });
30 
31     ---
32 
33     Copyright:
34         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
35         All rights reserved.
36 
37     License:
38         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
39         Alternatively, this file may be distributed under the terms of the Tango
40         3-Clause BSD License (see LICENSE_BSD.txt for details).
41 
42 *******************************************************************************/
43 
44 module ocean.text.csv.CSV;
45 
46 
47 import ocean.core.Enforce;
48 
49 import ocean.meta.types.Qualifiers;
50 
51 import ocean.util.container.AppendBuffer;
52 
53 import ocean.io.model.IConduit;
54 
55 import ocean.core.Verify;
56 
57 version (unittest) import ocean.core.Test;
58 
59 
60 /*******************************************************************************
61 
62     Simple CSV parser. Passes extracted fields, one row at a time to a
63     user-provided delegate.
64 
65 *******************************************************************************/
66 
public class CSV
{
    /***************************************************************************

        Type of delegate which receives parsed CSV rows.

        The delegate is called once per row with the fields of that row. It
        returns true to continue parsing or false to stop.

        The fields are slices into an internal row buffer which is cleared
        before the next row is assembled, so a delegate which needs to keep
        field data beyond the call should copy it.

    ***************************************************************************/

    public alias bool delegate ( cstring[] fields ) RowDg;


    /***************************************************************************

        Separator character. Defaults to comma, but may be set before calling
        parse().

    ***************************************************************************/

    public char separator = ',';


    /***************************************************************************

        Buffer used to build up a full row as data is read from the input
        stream. Inside this buffer, unquoted separators are stored as '\0'
        (see parse()), which is what parseRow() splits on.

    ***************************************************************************/

    private AppendBuffer!(char) row;


    /***************************************************************************

        List of slices into the row buffer, used to split the row into fields.

    ***************************************************************************/

    private AppendBuffer!(cstring) fields;

    /***************************************************************************

        Fixed size buffer for reading from the stream

    ***************************************************************************/

    private mstring buffer;


    /***************************************************************************

        Constructor. Allocates the row / field buffers and the 512 byte read
        buffer.

    ***************************************************************************/

    public this ( )
    {
        this.row = new AppendBuffer!(char);
        this.fields = new AppendBuffer!(cstring);
        this.buffer = new char[512];
    }

    /***************************************************************************

        Parses CSV data from the provided stream. Parsing ends when an EOF is
        encountered or when the delegate returns false. As rows are extracted
        and parsed, they are passed to the provided delegate.

        The input is consumed in chunks of at most this.buffer.length bytes.
        Quote state and the previously seen character are carried over from
        one chunk to the next, so quoted fields and escaped quotes ("") may
        span a chunk boundary, and a quote may be the very first character of
        the stream.

        The input must not contain '\0' characters: '\0' is used internally to
        mark field boundaries.

        Params:
            stream = stream to read CSV data from
            row_dg = delegate to receive parsed rows

    ***************************************************************************/

    public void parse ( InputStream stream, scope RowDg row_dg )
    {
        verify(stream !is null, "InputStream is null");
        verify(row_dg !is null, "Row delegate is null");

        this.row.clear();

        // appends chunk of data from stream when encountering any of control
        // symbols; the control symbol itself (at index `end`) is skipped
        scope append_chunk = ( mstring data, ref size_t start, size_t end )
        {
            this.row ~= data[start .. end];
            start = end + 1;
        };

        // indicates that the beginning of a stream chunk is already in the
        // middle of a quote
        bool in_quote = false;

        // character consumed immediately before the current one. It is kept
        // outside the read loop so that it survives chunk boundaries; '\0'
        // can never appear in the input, so it safely means "nothing yet".
        char prev_char = '\0';

        size_t bytes_read;

        while ((bytes_read = stream.read(this.buffer)) != InputStream.Eof)
        {
            size_t chunk_start = 0;
            auto data = this.buffer[0 .. bytes_read];

            foreach (i, c; data)
            {
                verify(c != '\0');

                // remember what preceded `c` and record `c` for the next
                // iteration before any of the `continue` statements below
                auto preceding = prev_char;
                prev_char = c;

                if (c == this.separator && !in_quote)
                {
                    // trick: make use of the fact there won't be a \0 symbol
                    // in the input stream and replace separator symbol with \0
                    // to disambiguate from escaped separator and make parsing
                    // a single row trivial
                    append_chunk(data, chunk_start, i);
                    this.row ~= '\0';
                    continue;
                }

                if (c == '"')
                {
                    in_quote = !in_quote;

                    // Do not index data[i - 1] here: a quote may be the first
                    // character of a chunk (i == 0), in which case the
                    // preceding character belongs to the previous read, or
                    // does not exist at all.
                    if (preceding == '"')
                    {
                        // need adjustment, it was escaped quote last time and
                        // not the end of quote. The preceding quote already
                        // moved chunk_start to `i`, so only this quote has to
                        // be skipped.
                        this.row ~= "\"";
                        chunk_start = i + 1;
                    }
                    else
                        append_chunk(data, chunk_start, i);
                    continue;
                }

                if (c == '\n')
                {
                    // a newline inside a quoted field is ordinary field data
                    if (in_quote)
                        continue;
                    append_chunk(data, chunk_start, i);

                    // if row_dg returns 'false', no further parsing is needed
                    if (!this.parseRow(row_dg))
                        return;
                    this.row.clear();
                    continue;
                }
            }

            // keep the unterminated tail of this chunk; the row continues in
            // the next read
            if (chunk_start < data.length )
                this.row ~= data[chunk_start .. $];
        }

        // final row which was not terminated by a newline
        if (this.row.length)
            this.parseRow(row_dg);
    }


    /***************************************************************************

        Parses the current row (contained in this.row) and passes the parsed
        fields to the provided delegate.

        The row is split at each '\0' character (the marker which parse()
        writes in place of an unquoted separator). An empty row produces a
        single empty field.

        Params:
            row_dg = delegate to receive parsed rows

        Returns:
            the value returned by row_dg (false requests that parsing stops)

    ***************************************************************************/

    private bool parseRow ( scope RowDg row_dg )
    {
        this.fields.clear();

        size_t field_start;

        foreach (i, c; this.row[])
        {
            if (c == '\0')
            {
                this.fields ~= this.row[field_start .. i];
                field_start = i + 1;
            }
        }

        // whatever follows the last marker is the final field of the row
        this.fields ~= this.row[field_start .. this.row.length];
        return row_dg(this.fields[]);
    }
}
249 
250 
251 
252 /*******************************************************************************
253 
254     UnitTest
255 
256 *******************************************************************************/
257 
258 version (unittest)
259 {
260     import ocean.io.device.Array;
261 }
262 
unittest
{
    // Feeds `str` through `csv` and checks that the parser produces exactly
    // the rows in `expected`, in order, each with exactly the expected fields.
    void test ( NamedTest t, CSV csv, cstring str, cstring[][] expected )
    {
        scope array = new Array(1024);
        array.append(str);

        size_t test_row;
        csv.parse(array,
        ( cstring[] parsed_fields )
        {
            // the parser must not produce more rows than expected (this also
            // guards the index into `expected` below)
            t.test!("<")(test_row, expected.length);

            auto fields = expected[test_row++];

            // missing or surplus fields must be reported, not silently
            // skipped by the element-wise comparison below
            t.test!("==")(parsed_fields.length, fields.length);

            foreach ( i, f; parsed_fields )
            {
                t.test!("==")(f, fields[i]);
            }
            return true;
        });

        // every expected row must actually have been delivered
        t.test!("==")(test_row, expected.length);
    }

    scope csv = new CSV;

    test(new NamedTest("Single Row"), csv,
`An,Example,Simple,CSV,Row`,
        [["An", "Example", "Simple", "CSV", "Row"]]);

    test(new NamedTest("Single row + quoted comma"), csv,
`An,Example,"Quoted,Field",CSV,Row`,
        [["An", "Example", "Quoted,Field", "CSV", "Row"]]);

    test(new NamedTest("Single row + quoted newline"), csv,
`An,Example,"Quoted
Field",CSV,Row`,
        [["An", "Example", "Quoted\nField", "CSV", "Row"]]);

    test(new NamedTest("Two rows"), csv,
`An,Example,Simple,CSV,Row
This,Time,With,Two,Rows`,
        [["An", "Example", "Simple", "CSV", "Row"],
         ["This","Time","With","Two","Rows"]]);

    test(new NamedTest("Quoted field last"), csv,
`An,Example,"Quoted"`,
        [["An", "Example", "Quoted"]]);

    test(new NamedTest("Partially quoted field"), csv,
`An,Example,"Quot"ed`,
        [["An", "Example", "Quoted"]]);

    test(new NamedTest("Escaped quote"), csv,
`An,""Example"","Quoted"`,
        [["An", "\"Example\"", "Quoted"]]);

}
318