/*******************************************************************************

    Zlib decoder which supports pkzip and gzip archives, and can be stored in a
    pool.

    In general it is not possible to stream pkzip archives. This is because the
    format supports some quirky features which were important in the days of
    floppy diskettes.

    This class supports an important case where streaming is
    possible: an archive which consists of a single file stored at
    the start of the archive, using DEFLATE compression.

    Needs linking with -lz.

    Usage example:

    ---

        import ocean.io.compress.ZipStream;

        auto unzipper = new ZipStreamDecompressor;

        unzipper.reset();

        try
        {
            unzipper.start();
        }
        catch (Exception e)
        {
            // Error!
        }

        // `downloader` is a hypothetical source which provides chunks of
        // compressed data, eg downloaded from a socket
        // Before processing, it may be wise to check that the first bytes in
        // the file are equal to GZipFileSignature (for gzip files) or
        // ocean.util.compress.c.Zip.ZipLocalFileHeaderSignature (for pkzip); an
        // exception will be thrown if this is not true.

        foreach (compressed_chunk; downloader)
        {
            try
            {
                auto uncompressed = unzipper.decompress(compressed_chunk);

                Stdout.format("{}", uncompressed);
            }
            catch (Exception e)
            {
                // Error!
            }
        }

        if (!unzipper.endDecompression())
        {
            // Error!
        }
    ---


    copyright: Copyright (c) 2016 dunnhumby Germany GmbH.
        All rights reserved

*******************************************************************************/

module ocean.io.compress.ZipStream;


import ocean.core.Array : startsWith;
import ocean.core.Exception;
import ocean.io.compress.ZlibStream;
import ocean.meta.types.Qualifiers;
import ocean.util.compress.c.Zip;
import ocean.util.container.AppendBuffer;
import ocean.util.digest.Crc32;


/*******************************************************************************

    The file signature (magic number) used to identify a GZIP file

*******************************************************************************/

public static istring GZipFileSignature = "\x1F\x8b";



/*******************************************************************************

    Zlib decoder which supports both gzip and pkzip compressed streams.

    Pkzip files are supported only if they contain a single file which is
    located at the start of the archive, and which contain a complete local file
    header record (that is, the length and CRC of that file are specified at the
    start of the archive).

    The format is detected from the first chunk passed to decompress(), so that
    chunk must contain the complete file signature.

*******************************************************************************/

public class ZipStreamDecompressor : ZlibStreamDecompressor
{
    /***************************************************************************

        Object pool index, allows instances of this type to be stored in a
        pool.

    ***************************************************************************/

    public size_t object_pool_index;


    /***************************************************************************

        Feed decompression buffer.

        While a PKZIP local file header is being read, the header bytes are
        accumulated here. Afterwards it holds the decompressed output of the
        most recent call to decompress().

    ***************************************************************************/

    private AppendBuffer!(ubyte) uncompressed;


    /***************************************************************************

        CRC instance for validating Pkzip files

    ***************************************************************************/

    private Crc32 crc;


    /***************************************************************************

        Header of the current compressed file, if this is a PKZIP archive

    ***************************************************************************/

    private ZipLocalFileHeaderRecord zip_header;


    /***************************************************************************

        State of the decompression. The file may be a GZip file, or a PKZIP
        archive.

    ***************************************************************************/

    private enum DecompressState
    {
        NotStarted,     /// Decompression has not yet begun
        GzipStarted,    /// Gzip decompression is in progress
        PkzipHeader,    /// A Pkzip local file header is being read
        PkzipExtra,     /// A Pkzip extra field is being skipped
        PkZipBody,      /// Pkzip compressed data is being read
        PkZipTrailer,   /// Data after the compressed file is being skipped
        PkZipFailed     /// Pkzip data ended, but failed the final validation
    }

    private DecompressState state;


    /***************************************************************************

        Counter which drops to zero when the current PKZIP section has finished

        It is unsigned because it is only ever compared with, and used to
        slice, array lengths.

    ***************************************************************************/

    private size_t pkzip_byte_counter;


    /***************************************************************************

        Reusable exception thrown when a zip file cannot be decompressed

    ***************************************************************************/

    public static class DecompressionException : Exception
    {

        /***********************************************************************

            Provides standard reusable exception API

        ***********************************************************************/

        mixin ReusableExceptionImplementation!();

    }


    /***************************************************************************

        Reusable exception instance

    ***************************************************************************/


    private DecompressionException exception;


    /***************************************************************************

        Constructor

    ***************************************************************************/

    public this ()
    {
        this.uncompressed = new AppendBuffer!(ubyte);
        this.crc = new Crc32;
        this.exception = new DecompressionException;
    }


    /***************************************************************************

        Begin processing of a compressed file

        Only the format-detection state is reset; the next call to
        decompress() will inspect the file signature again.

    ***************************************************************************/

    public void reset ( )
    {
        this.state = DecompressState.NotStarted;
    }


    /***************************************************************************

        Release the resources used for decompression, and perform a consistency
        check

        Returns:
            true if the file was well-formed, false if it was inconsistent
            (including the case where decompress() reported an incomplete
            stream or a checksum mismatch)

    ***************************************************************************/

    public bool endDecompression ( )
    {
        // If decompression was started, end it.

        if ( this.state == DecompressState.GzipStarted ||
            this.state == DecompressState.PkZipBody )
        {
            return this.end();
        }

        // If it was a PKZIP file and we reached the end,
        // it is OK. The zlib stream has already been ended by decompress().

        if ( this.state == DecompressState.PkZipTrailer )
        {
            return true;
        }

        // Any other situation is an error. This includes an archive which
        // was never recognised, one whose header was never completed, and
        // one which failed the final validation (PkZipFailed).

        return false;
    }


    /***************************************************************************

        Decompress a chunk of input data

        The first chunk after reset() is used to detect the file format, so it
        must begin with the complete gzip or pkzip signature.

        Params:
            data = received data chunk

        Returns:
            the uncompressed data, or null if this chunk produced none (header
            bytes, skipped fields, or data following the compressed file). The
            returned slice refers to an internal buffer which is reused by the
            next call.

        Throws:
            DecompressionException if the format is not recognised, if the
            pkzip archive is not of the supported streamable kind, if the
            compressed stream is incomplete, or if the checksum does not
            match. Errors raised by the underlying decoder also propagate.

    ***************************************************************************/

    public ubyte [] decompress ( const(ubyte) [] data )
    {
        if ( this.state == DecompressState.NotStarted )
        {
            // Use the first bytes to identify which format it is

            if ( startsWith(cast(cstring)data, GZipFileSignature) )
            {
                // GZip file. The signature is left in place: it is part of
                // the stream handed to the decoder.

                this.state = DecompressState.GzipStarted;

                this.start();
            }
            else if ( startsWith(cast(cstring)data,
                          ZipLocalFileHeaderSignature) )
            {
                // PKZip file. The signature is consumed here; the header
                // record which follows is collected in 'uncompressed'.

                this.state = DecompressState.PkzipHeader;
                this.uncompressed.clear();
                data = data[ZipLocalFileHeaderSignature.length..$];
            }
            else
            {
                this.exception.set("Unsupported file format");
                throw this.exception;
            }
        }

        if ( this.state == DecompressState.PkzipHeader )
        {
            // Append data to 'uncompressed' until we've obtained the header

            if ( this.uncompressed.length + data.length
                < this.zip_header.sizeof )
            {
                this.uncompressed.append(data);
                return null;
            }

            // Take only as many bytes as are needed to complete the header

            auto len = this.zip_header.sizeof - this.uncompressed.length;

            this.uncompressed.append(data[0..len]);
            this.zip_header = *cast(ZipLocalFileHeaderRecord *)
                (this.uncompressed[]);

            // Check that the file format is one which we support

            if ( this.zip_header.isCrcMissing() )
            {
                this.exception.set("Zip file is not streamable - No CRC");
                throw this.exception;
            }

            if ( !this.zip_header.isDeflateCompressed() )
            {
                // This error most likely indicates data corruption, or a tiny
                // file. Deflate compression has been standard since 1993.
                // Tiny files are STORED instead of DEFLATED.
                this.exception.set("Zip file uses unsupported compression");
                throw this.exception;
            }

            data = data[ len..$ ];

            this.state = DecompressState.PkzipExtra;

            // Calculate the number of bytes which need to be skipped

            this.pkzip_byte_counter = this.zip_header.file_name_length +
                this.zip_header.extra_field_length;

        }

        if ( this.state == DecompressState.PkzipExtra )
        {
            // Skip the filename and the 'extra field'

            if ( this.pkzip_byte_counter >= data.length )
            {
                // The whole chunk belongs to the skipped fields

                this.pkzip_byte_counter -= data.length;

                return null;
            }

            data = data[this.pkzip_byte_counter .. $];

            // Now, we start decompressing the actual zip stream
            // It does not have any header encoding

            this.start(Encoding.None);

            // Reset the CRC. The digest is reset whenever it is read, so if
            // a previous digest was never read (for example because the
            // previous file was abandoned), stale state would otherwise
            // corrupt the checksum of this file.

            this.crc.crc32Digest();

            // Determine how many bytes of compressed data to wait for

            this.pkzip_byte_counter = this.zip_header.compressed_size;

            this.state = DecompressState.PkZipBody;
        }

        // Now, obtain the compressed data

        auto compressed_data = data;

        if ( this.state == DecompressState.PkZipBody )
        {
            // Never feed the decoder more than the compressed size declared
            // in the header; anything beyond it is archive trailer.

            if ( data.length >= this.pkzip_byte_counter )
            {
                compressed_data = data[0 .. this.pkzip_byte_counter];
            }

            this.pkzip_byte_counter -= compressed_data.length;

            data = data[compressed_data.length .. $];
        }

        if ( this.state == DecompressState.PkZipTrailer ||
            this.state == DecompressState.PkZipFailed )
        {
            // Don't need anything more from the file.
            // Just skip everything.

            return null;
        }

        this.uncompressed.clear();

        // The cast is necessary only because ZLibStreamDecompressor isn't
        // const-correct.
        this.decodeChunk(cast(ubyte[])compressed_data,
            ( ubyte[] uncompressed_chunk )
            {
                this.uncompressed.append(uncompressed_chunk);
            }
        );

        if ( this.state == DecompressState.PkZipBody )
        {
            this.crc.update(this.uncompressed[]);

            // Check if we have finished reading the compressed data

            if ( this.pkzip_byte_counter == 0 )
            {
                // The compressed data is over. Until the checks below have
                // passed the archive is treated as failed, so that
                // endDecompression() reports an inconsistent file if any of
                // them throws.

                this.state = DecompressState.PkZipFailed;

                // Check if it was genuinely the end of a compressed stream

                if ( !this.end() )
                {
                    this.exception.set("Zip file is incomplete");
                    throw this.exception;
                }

                // Now that the compressed data has finished, check if
                // the checksum was correct.
                // Note that the CRC is reset when you read the digest.

                auto calculated_crc = this.crc.crc32Digest();

                if ( calculated_crc != this.zip_header.crc_32 )
                {
                    this.exception.set("Zip file checksum failed.");
                    throw this.exception;
                }

                // Fully validated: anything which follows is just skipped

                this.state = DecompressState.PkZipTrailer;
            }
        }

        return this.uncompressed[];
    }
}


unittest
{
    import ocean.core.Test;

    // A tiny PKZIP file using DEFLATE compression

    immutable rawZip =
        "\x50\x4b\x03\x04\x14\x00\x00\x00\x08\x00\xa2\x6e\x2d\x50\x2f\xbd" ~
        "\x37\x12\x08\x00\x00\x00\x10\x00\x00\x00\x08\x00\x1c\x00\x74\x65" ~
        "\x73\x74\x2e\x74\x78\x74\x55\x54\x09\x00\x03\x30\x68\x1c\x5e\x30" ~
        "\x68\x1c\x5e\x75\x78\x0b\x00\x01\x04\xe8\x03\x00\x00\x04\xe8\x03" ~
        "\x00\x00\x4b\x4c\x4c\x4a\x44\x42\x5c\x00\x50\x4b\x01\x02\x1e\x03" ~
        "\x14\x00\x00\x00\x08\x00\xa2\x6e\x2d\x50\x2f\xbd\x37\x12\x08\x00" ~
        "\x00\x00\x10\x00\x00\x00\x08\x00\x18\x00\x00\x00\x00\x00\x01\x00" ~
        "\x00\x00\xb4\x81\x00\x00\x00\x00\x74\x65\x73\x74\x2e\x74\x78\x74" ~
        "\x55\x54\x05\x00\x03\x30\x68\x1c\x5e\x75\x78\x0b\x00\x01\x04\xe8" ~
        "\x03\x00\x00\x04\xe8\x03\x00\x00\x50\x4b\x05\x06\x00\x00\x00\x00" ~
        "\x01\x00\x01\x00\x4e\x00\x00\x00\x4a\x00\x00\x00\x00\x00";

    auto unzipper = new ZipStreamDecompressor;
    unzipper.reset();
    unzipper.start();
    auto decom = unzipper.decompress(cast(const(ubyte)[]) rawZip);
    test!("==")(decom, "aabaabaabaabaab\n");
    test(unzipper.endDecompression());

    // Now process the same file, in chunks.

    unzipper.reset();
    unzipper.start();
    decom = unzipper.decompress(cast(const(ubyte)[]) rawZip[0..10]);
    test(decom.length == 0);
    decom = unzipper.decompress(cast(const(ubyte)[]) rawZip[10..60]);
    test(decom.length == 0);
    decom = unzipper.decompress(cast(const(ubyte)[]) rawZip[60..70]);
    test!("==")(decom, "aab");
    decom = unzipper.decompress(cast(const(ubyte)[]) rawZip[70..72]);
    test!("==")(decom, "aabaabaabaab");
    decom = unzipper.decompress(cast(const(ubyte)[]) rawZip[72..$]);
    test!("==")(decom, "\n");
    test(unzipper.endDecompression());

    // Finally, corrupt the CRC stored in the local file header (bytes 14-17
    // of the archive). Decompression must throw, and the consistency check
    // must then report the file as inconsistent.

    auto corrupted = cast(ubyte[]) rawZip.dup;
    corrupted[14] ^= 0xFF;

    unzipper.reset();
    unzipper.start();

    bool checksum_rejected = false;

    try
    {
        unzipper.decompress(corrupted);
    }
    catch ( ZipStreamDecompressor.DecompressionException e )
    {
        checksum_rejected = true;
    }

    test(checksum_rejected);
    test(!unzipper.endDecompression());
}