/*******************************************************************************

    Class for parsing streams of CSV data.

    Currently the class is capable of parsing only fairly simple, well-formatted
    CSV. The following basic format features are supported:

        * Newline (\n) separated lines.
        * Comma (or arbitrary character) -separated fields.
        * Quoted fields (a " character, followed by any number of characters,
          and delimited by another " and a separator character). Separators
          (commas) and newlines (\n) may both appear inside quoted fields.
        * Two directly adjacent " characters produce one literal " character
          in the parsed field.

    Usage:

    ---

        import ocean.io.Stdout;
        import ocean.io.device.File;

        scope file = new File("example.csv", File.ReadExisting);
        scope csv = new CSV;

        csv.parse(file,
        (cstring[] fields)
        {
            Stdout.formatln("Row={}", fields);
            return true; // tells CSV instance to continue parsing
        });

    ---

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

*******************************************************************************/

module ocean.text.csv.CSV;


import ocean.core.Enforce;

import ocean.meta.types.Qualifiers;

import ocean.util.container.AppendBuffer;

import ocean.io.model.IConduit;

import ocean.core.Verify;

version (unittest) import ocean.core.Test;


/*******************************************************************************

    Simple CSV parser. Passes extracted fields, one row at a time to a
    user-provided delegate.

*******************************************************************************/

public class CSV
{
    /***************************************************************************

        Type of delegate which receives parsed CSV rows. The passed slices
        point into an internal buffer which is reused for the next row, so
        they must be copied if they are needed after the delegate returns.

        Returning false stops parsing; returning true continues with the next
        row.

    ***************************************************************************/

    public alias bool delegate ( cstring[] fields ) RowDg;


    /***************************************************************************

        Separator character. Defaults to comma, but may be set before calling
        parse().

    ***************************************************************************/

    public char separator = ',';


    /***************************************************************************

        Buffer used to build up a full row as data is read from the input
        stream. Unquoted separators are stored in it as '\0' characters.

    ***************************************************************************/

    private AppendBuffer!(char) row;


    /***************************************************************************

        List of slices into the row buffer, used to split the row into fields.

    ***************************************************************************/

    private AppendBuffer!(cstring) fields;

    /***************************************************************************

        Fixed size buffer for reading from stream

    ***************************************************************************/

    private mstring buffer;


    /***************************************************************************

        Constructor. Allocates the row and field buffers and the fixed size
        read buffer.

    ***************************************************************************/

    public this ( )
    {
        this.row = new AppendBuffer!(char);
        this.fields = new AppendBuffer!(cstring);
        this.buffer = new char[512];
    }

    /***************************************************************************

        Parses CSV data from the provided stream. Parsing ends when an EOF is
        encountered or when the delegate returns false. As rows are extracted
        and parsed, they are passed to the provided delegate.

        The input must not contain '\0' characters, as that character is used
        internally as the field delimiter.

        Note that any two directly adjacent '"' characters are turned into one
        literal '"', so an empty quoted field ("") is parsed as a field
        containing a single '"' character.

        Params:
            stream = stream to read CSV data from
            row_dg = delegate to receive parsed rows

    ***************************************************************************/

    public void parse ( InputStream stream, scope RowDg row_dg )
    {
        verify(stream !is null, "InputStream is null");
        verify(row_dg !is null, "Row delegate is null");

        this.row.clear();

        // appends chunk of data from stream when encountering any of control
        // symbols, and moves the chunk start past the control symbol
        scope append_chunk = ( mstring data, ref size_t start, size_t end )
        {
            this.row ~= data[start .. end];
            start = end + 1;
        };

        // indicates that the beginning of a stream chunk is already in the
        // middle of a quote
        bool in_quote = false;

        // last character of the previously read chunk. Needed to detect an
        // escaped quote when a '"' is the very first character of a chunk.
        // '\0' is a safe initial value as it never occurs in valid input.
        char last_of_prev_chunk = '\0';

        size_t bytes_read;

        while ((bytes_read = stream.read(this.buffer)) != InputStream.Eof)
        {
            size_t chunk_start = 0;
            auto data = this.buffer[0 .. bytes_read];

            foreach (i, c; data)
            {
                verify(c != '\0');

                if (c == this.separator && !in_quote)
                {
                    // trick: make use of the fact there won't be a \0 symbol
                    // in the input stream and replace separator symbol with \0
                    // to disambiguate from escaped separator and make parsing
                    // a single row trivial
                    append_chunk(data, chunk_start, i);
                    this.row ~= '\0';
                    continue;
                }

                if (c == '"')
                {
                    in_quote = !in_quote;

                    // the character preceding this quote may live in the
                    // previous chunk; never index data[i - 1] when i == 0
                    char prev = (i > 0) ? data[i - 1] : last_of_prev_chunk;

                    if (prev == '"')
                    {
                        // need adjustment, it was escaped quote last time and
                        // not the end of quote
                        this.row ~= "\"";
                        // the previous quote already moved chunk_start to i,
                        // so this skips exactly the current quote character
                        chunk_start = i + 1;
                    }
                    else
                        append_chunk(data, chunk_start, i);
                    continue;
                }

                if (c == '\n')
                {
                    // newlines inside quoted fields are part of the field
                    if (in_quote)
                        continue;
                    append_chunk(data, chunk_start, i);

                    // if row_dg returns 'false', no further parsing is needed
                    if (!this.parseRow(row_dg))
                        return;
                    this.row.clear();
                    continue;
                }
            }

            // keep the unterminated tail of this chunk, the row continues in
            // the next chunk (or ends at EOF)
            if (chunk_start < data.length)
                this.row ~= data[chunk_start .. $];

            if (data.length)
                last_of_prev_chunk = data[$ - 1];
        }

        // final row which is not terminated by a newline
        if (this.row.length)
            this.parseRow(row_dg);
    }


    /***************************************************************************

        Parses the current row (contained in this.row) and passes the parsed
        fields to the provided delegate. Fields are delimited by the '\0'
        characters which parse() stored in place of unquoted separators.

        Params:
            row_dg = delegate to receive parsed rows

        Returns:
            the return value of row_dg (false if parsing should stop)

    ***************************************************************************/

    private bool parseRow ( scope RowDg row_dg )
    {
        this.fields.clear();

        size_t field_start;

        foreach (i, c; this.row[])
        {
            if (c == '\0')
            {
                this.fields ~= this.row[field_start .. i];
                field_start = i + 1;
            }
        }

        // the last field of a row has no trailing delimiter
        this.fields ~= this.row[field_start .. this.row.length];
        return row_dg(this.fields[]);
    }
}



/*******************************************************************************

    UnitTest

*******************************************************************************/

version (unittest)
{
    import ocean.io.device.Array;
}

unittest
{
    // Parses str and checks that exactly the expected rows and fields are
    // passed to the row delegate.
    void test ( NamedTest t, CSV csv, cstring str, cstring[][] expected )
    {
        scope array = new Array(1024);
        array.append(str);

        size_t test_row;
        csv.parse(array,
            ( cstring[] parsed_fields )
            {
                t.test!("<")(test_row, expected.length);
                auto fields = expected[test_row++];

                t.test!("==")(parsed_fields.length, fields.length);

                foreach ( i, f; parsed_fields )
                {
                    t.test!("==")(f, fields[i]);
                }
                return true;
            });

        t.test!("==")(test_row, expected.length);
    }

    scope csv = new CSV;

    test(new NamedTest("Single Row"), csv,
        `An,Example,Simple,CSV,Row`,
        [["An", "Example", "Simple", "CSV", "Row"]]);

    test(new NamedTest("Single row + quoted comma"), csv,
        `An,Example,"Quoted,Field",CSV,Row`,
        [["An", "Example", "Quoted,Field", "CSV", "Row"]]);

    // the continuation line of the literal must start in the first column,
    // leading whitespace would become part of the field
    test(new NamedTest("Single row + quoted newline"), csv,
`An,Example,"Quoted
Field",CSV,Row`,
        [["An", "Example", "Quoted\nField", "CSV", "Row"]]);

    test(new NamedTest("Two rows"), csv,
`An,Example,Simple,CSV,Row
This,Time,With,Two,Rows`,
        [["An", "Example", "Simple", "CSV", "Row"],
         ["This","Time","With","Two","Rows"]]);

    test(new NamedTest("Quoted field last"), csv,
        `An,Example,"Quoted"`,
        [["An", "Example", "Quoted"]]);

    test(new NamedTest("Partially quoted field"), csv,
        `An,Example,"Quot"ed`,
        [["An", "Example", "Quoted"]]);

    test(new NamedTest("Escaped quote"), csv,
        `An,""Example"","Quoted"`,
        [["An", "\"Example\"", "Quoted"]]);

    // a quote as the very first character of the stream
    test(new NamedTest("Quoted field first"), csv,
        `"Quoted",Field`,
        [["Quoted", "Field"]]);

    // The read buffer is private but accessible from within this module. The
    // inputs below place a quote on the first byte of the second read chunk.
    mstring filler = new char[csv.buffer.length - 1];
    filler[] = 'a';

    cstring long_field = filler;
    test(new NamedTest("Opening quote at chunk start"), csv,
        long_field ~ ",\"Quoted\"",
        [[long_field, "Quoted"]]);

    cstring shorter_field = filler[0 .. $ - 1];
    test(new NamedTest("Escaped quote split across chunks"), csv,
        shorter_field ~ ",\"\"x",
        [[shorter_field, "\"x"]]);
}