1 /*******************************************************************************
2 
3     Zlib decoder which supports pkzip and gzip archives, and can be stored in a
4     pool.
5 
6     In general it is not possible to stream pkzip archives. This is because the
7     format supports some quirky features which were important in the days of
8     floppy diskettes.
9 
10     This class supports an important case where streaming is
11     possible: an archive which consists of a single file stored at
12     the start of the archive, using DEFLATE compression.
13 
14     Needs linking with -lz.
15 
16     Usage example:
17 
18     ---
19 
20         import ocean.io.compress.ZipStream;
21 
22         auto unzipper = new ZipStreamDecompressor;
23 
24         unzipper.reset();
25 
26         try
27         {
28             unzipper.start();
29         }
30         catch (Exception e)
31         {
32             // Error!
33         }
34 
35         // `downloader` is a hypothetical source which provides chunks of
36         // compressed data, eg downloaded from a socket
37         // Before processing, it may be wise to check that the first bytes in
38         // the file are equal to GZipFileSignature (for gzip files) or
39         // ocean.util.compress.c.Zip.ZipLocalFileHeaderSignature (for pkzip); an
40         // exception will be thrown if this is not true.
41 
42         foreach (compressed_chunk; downloader)
43         {
44             try
45             {
                uncompressed = unzipper.decompress(compressed_chunk);

                Stdout.format("{}", uncompressed);
49             }
50             catch (Exception e)
51             {
52                 // Error!
53             }
54         }
55 
        if (!unzipper.endDecompression())
57         {
58             // Error!
59         }
60     ---
61 
62 
63     copyright:  Copyright (c) 2016 dunnhumby Germany GmbH. All rights reserved
64 
65 *******************************************************************************/
66 
67 module ocean.io.compress.ZipStream;
68 
69 
70 import ocean.core.Array : startsWith;
71 import ocean.core.Exception;
72 import ocean.io.compress.ZlibStream;
73 import ocean.meta.types.Qualifiers;
74 import ocean.util.compress.c.Zip;
75 import ocean.util.container.AppendBuffer;
76 import ocean.util.digest.Crc32;
77 
78 
/*******************************************************************************

    Magic number found at the very start of a GZIP file. The decompressor
    compares the first bytes of the input against this value to recognise
    gzip data.

*******************************************************************************/

public istring GZipFileSignature = "\x1F\x8b";
86 
87 
88 
89 /*******************************************************************************
90 
91     Zlib decoder which supports both gzip and pkzip compressed streams.
92 
    Pkzip files are supported only if they contain a single file which is
    located at the start of the archive, and which contains a complete local
    file header record (that is, the length and CRC of that file are specified
    at the start of the archive).
97 
98 *******************************************************************************/
99 
public class ZipStreamDecompressor : ZlibStreamDecompressor
{
    /***************************************************************************

        Object pool index, allows instances of this type to be stored in a
        pool.

    ***************************************************************************/

    public size_t object_pool_index;


    /***************************************************************************

        Working buffer. While a PKZIP local file header is being received it
        accumulates the raw header bytes; afterwards it holds the data
        decompressed by the most recent call to decompress().

    ***************************************************************************/

    private AppendBuffer!(ubyte) uncompressed;


    /***************************************************************************

        CRC instance for validating Pkzip files

    ***************************************************************************/

    private Crc32 crc;


    /***************************************************************************

        Header of the current compressed file, if this is a PKZIP archive

    ***************************************************************************/

    private ZipLocalFileHeaderRecord zip_header;


    /***************************************************************************

        State of the decompression. The file may be a GZip file, or a PKZIP
        archive.

    ***************************************************************************/

    private enum DecompressState
    {
        NotStarted,    /// Decompression has not yet begun
        GzipStarted,   /// Gzip decompression is in progress
        PkzipHeader,   /// A Pkzip local file header is being read
        PkzipExtra,    /// A Pkzip extra field is being skipped
        PkZipBody,     /// Pkzip compressed data is being read
        PkZipTrailer,  /// Data after the compressed file is being skipped
        Failed         /// The Pkzip body failed its final consistency checks
    }

    private DecompressState state;


    /***************************************************************************

        Number of bytes remaining in the current PKZIP section (file name and
        extra field, or compressed body). Drops to zero when the section has
        finished.

        Unsigned and pointer-sized so that it can hold any size stored in the
        header and be compared directly with array lengths.

    ***************************************************************************/

    private size_t pkzip_byte_counter;


    /***************************************************************************

        Reusable exception thrown when a zip file cannot be decompressed

    ***************************************************************************/

    public static class DecompressionException : Exception
    {

        /***********************************************************************

            Provides standard reusable exception API

        ***********************************************************************/

        mixin ReusableExceptionImplementation!();

    }


    /***************************************************************************

        Reusable exception instance

    ***************************************************************************/

    private DecompressionException exception;


    /***************************************************************************

        Constructor. Allocates the working buffer, the CRC calculator and the
        reusable exception.

    ***************************************************************************/

    public this ()
    {
        this.uncompressed = new AppendBuffer!(ubyte);
        this.crc = new Crc32;
        this.exception = new DecompressionException;
    }


    /***************************************************************************

        Begin processing of a compressed file. The format of the file is
        detected from the next chunk passed to decompress().

    ***************************************************************************/

    public void reset ( )
    {
        this.state = DecompressState.NotStarted;
    }


    /***************************************************************************

        Release the resources used for decompression, and perform a consistency
        check

        Returns:
            true if the file was well-formed, false if it was inconsistent.
            In particular, false is returned if no data was ever processed,
            if a PKZIP header was never completed, or if decompress() has
            reported an incomplete body or a checksum mismatch.

    ***************************************************************************/

    public bool endDecompression ( )
    {
        // If decompression was started, end it.

        if ( this.state == DecompressState.GzipStarted ||
            this.state == DecompressState.PkZipBody )
        {
            return this.end();
        }

        // If it was a PKZIP file and we reached the end,
        // it is OK

        if ( this.state == DecompressState.PkZipTrailer )
        {
            return true;
        }

        // Any other situation (nothing processed, header never completed,
        // or the body failed its final checks) is an error

        return false;
    }


    /***************************************************************************

        Decompress a chunk of input data

        The first chunk received after reset() is used to detect the file
        format, so it must begin with the gzip or pkzip signature.

        Params:
            data = received data chunk

        Returns:
            the data decompressed from this chunk, as a slice of an internal
            buffer which is cleared by the next call; null if the chunk
            contained only PKZIP header or trailer bytes

        Throws:
            DecompressionException if the format is not recognised, if the
            PKZIP header describes an unsupported file, or if the PKZIP body
            is incomplete or fails its CRC check. Errors raised by the
            underlying zlib decoder are propagated unchanged.

    ***************************************************************************/

    public ubyte [] decompress ( const(ubyte) [] data )
    {
        if ( this.state == DecompressState.NotStarted )
        {
            // Use the first bytes to identify which format it is

            if ( startsWith(cast(cstring)data, GZipFileSignature) )
            {
                // GZip file

                this.state = DecompressState.GzipStarted;

                this.start();
            }
            else if ( startsWith(cast(cstring)data,
                ZipLocalFileHeaderSignature) )
            {
                // PKZip file. The signature itself is not part of the header
                // record, so it is dropped here.

                this.state = DecompressState.PkzipHeader;
                this.uncompressed.clear();
                data = data[ZipLocalFileHeaderSignature.length..$];
            }
            else
            {
                this.exception.set("Unsupported file format");
                throw this.exception;
            }
        }

        if ( this.state == DecompressState.PkzipHeader )
        {
            // Append data to 'uncompressed' until we've obtained the header

            if ( this.uncompressed.length + data.length
                < this.zip_header.sizeof )
            {
                this.uncompressed.append(data);
                return null;
            }

            auto len = this.zip_header.sizeof - this.uncompressed.length;

            this.uncompressed.append(data[0..len]);

            // The buffer now holds exactly one header record; copy it out.

            this.zip_header = *cast(ZipLocalFileHeaderRecord *)
                            (this.uncompressed[].ptr);

            // Check that the file format is one which we support

            if ( this.zip_header.isCrcMissing() )
            {
                this.exception.set("Zip file is not streamable - No CRC");
                throw this.exception;
            }

            if ( !this.zip_header.isDeflateCompressed() )
            {
                // This error most likely indicates data corruption, or a tiny
                // file. Deflate compression has been standard since 1993.
                // Tiny files are STORED instead of DEFLATED.
                this.exception.set("Zip file uses unsupported compression");
                throw this.exception;
            }

            data = data[ len..$ ];

            this.state = DecompressState.PkzipExtra;

            // Calculate the number of bytes which need to be skipped

            this.pkzip_byte_counter = this.zip_header.file_name_length +
                this.zip_header.extra_field_length;

        }

        if ( this.state == DecompressState.PkzipExtra )
        {
            // Skip the filename and the 'extra field'

            if ( this.pkzip_byte_counter >= data.length )
            {
                this.pkzip_byte_counter -= data.length;

                return null;
            }

            data = data[this.pkzip_byte_counter .. $];

            // Now, we start decompressing the actual zip stream
            // It does not have any header encoding

            this.start(Encoding.None);

            // Reset the CRC. This is a workaround for a terrible Tango design
            // (the CRC is reset when you read the digest -- which means that
            // if the digest wasn't read, the next CRC will be incorrect)

            this.crc.crc32Digest();

            // Determine how many bytes of compressed data to wait for

            this.pkzip_byte_counter = this.zip_header.compressed_size;

            this.state = DecompressState.PkZipBody;
        }

        // Now, obtain the compressed data

        auto compressed_data = data;

        if ( this.state == DecompressState.PkZipBody )
        {
            // Never feed the decoder more than the header said the body
            // contains; anything beyond that belongs to the trailer.

            if ( data.length >= this.pkzip_byte_counter )
            {
                compressed_data = data[0 .. this.pkzip_byte_counter];
            }

            this.pkzip_byte_counter -= compressed_data.length;

            data = data[compressed_data.length .. $];
        }

        if ( this.state == DecompressState.PkZipTrailer ||
            this.state == DecompressState.Failed )
        {
            // Don't need anything more from the file.
            // Just skip everything.

            return null;
        }

        this.uncompressed.clear();

        // The cast is necessary only because ZLibStreamDecompressor isn't
        // const-correct.
        this.decodeChunk(cast(ubyte[])compressed_data,
            ( ubyte[] uncompressed_chunk )
            {
                this.uncompressed.append(uncompressed_chunk);
            }
        );

        if ( this.state == DecompressState.PkZipBody )
        {
            this.crc.update(this.uncompressed[]);

            // Check if we have finished reading the compressed data

            if ( this.pkzip_byte_counter == 0 )
            {
                // Check if it was genuinely the end of a compressed stream

                if ( !this.end() )
                {
                    // Remember the failure so that endDecompression() does
                    // not report this file as well-formed.
                    this.state = DecompressState.Failed;

                    this.exception.set("Zip file is incomplete");
                    throw this.exception;
                }

                // Now that the compressed data has finished, check if
                // the checksum was correct.
                // Note that the CRC is reset when you read the digest.

                auto calculated_crc = this.crc.crc32Digest();

                if ( calculated_crc != this.zip_header.crc_32 )
                {
                    this.state = DecompressState.Failed;

                    this.exception.set("Zip file checksum failed.");
                    throw this.exception;
                }

                // Body complete and verified; everything which follows is
                // trailer data to be ignored.

                this.state = DecompressState.PkZipTrailer;
            }
        }
        return this.uncompressed[];
    }
}
449 
450 
// Decompresses a small PKZIP archive twice: once delivered as a single chunk,
// and once split at boundaries which fall inside the header, inside the
// file name / extra field, and inside the compressed body.
unittest
{
    import ocean.core.Test;

    // A tiny PKZIP file using DEFLATE compression

    immutable archive =
        "\x50\x4b\x03\x04\x14\x00\x00\x00\x08\x00\xa2\x6e\x2d\x50\x2f\xbd" ~
        "\x37\x12\x08\x00\x00\x00\x10\x00\x00\x00\x08\x00\x1c\x00\x74\x65" ~
        "\x73\x74\x2e\x74\x78\x74\x55\x54\x09\x00\x03\x30\x68\x1c\x5e\x30" ~
        "\x68\x1c\x5e\x75\x78\x0b\x00\x01\x04\xe8\x03\x00\x00\x04\xe8\x03" ~
        "\x00\x00\x4b\x4c\x4c\x4a\x44\x42\x5c\x00\x50\x4b\x01\x02\x1e\x03" ~
        "\x14\x00\x00\x00\x08\x00\xa2\x6e\x2d\x50\x2f\xbd\x37\x12\x08\x00" ~
        "\x00\x00\x10\x00\x00\x00\x08\x00\x18\x00\x00\x00\x00\x00\x01\x00" ~
        "\x00\x00\xb4\x81\x00\x00\x00\x00\x74\x65\x73\x74\x2e\x74\x78\x74" ~
        "\x55\x54\x05\x00\x03\x30\x68\x1c\x5e\x75\x78\x0b\x00\x01\x04\xe8" ~
        "\x03\x00\x00\x04\xe8\x03\x00\x00\x50\x4b\x05\x06\x00\x00\x00\x00" ~
        "\x01\x00\x01\x00\x4e\x00\x00\x00\x4a\x00\x00\x00\x00\x00";

    auto archive_bytes = cast(const(ubyte)[]) archive;

    auto decoder = new ZipStreamDecompressor;

    // Whole archive delivered in one go

    decoder.reset();
    decoder.start();
    auto output = decoder.decompress(archive_bytes);
    test!("==")(output, "aabaabaabaabaab\n");
    test(decoder.endDecompression());

    // Now process the same file, in chunks.

    static struct Step
    {
        size_t from;     // index of the first byte of the chunk
        size_t to;       // index one past the last byte of the chunk
        string expected; // output expected for the chunk, empty if none
    }

    Step[] steps =
    [
        Step( 0, 10, ""),
        Step(10, 60, ""),
        Step(60, 70, "aab"),
        Step(70, 72, "aabaabaabaab"),
        Step(72, archive_bytes.length, "\n")
    ];

    decoder.reset();
    decoder.start();

    foreach ( step; steps )
    {
        output = decoder.decompress(archive_bytes[step.from .. step.to]);

        if ( step.expected.length == 0 )
        {
            test(output.length == 0);
        }
        else
        {
            test!("==")(output, step.expected);
        }
    }

    test(decoder.endDecompression());
}