ocean.text.convert.UnicodeBom source code

1 /*******************************************************************************
2 
3         Copyright:
4             Copyright (c) 2004 Kris Bell.
5             Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6             All rights reserved.
7 
8         License:
9             Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10             See LICENSE_TANGO.txt for details.
11 
12         Version: Initial release: December 2005
13 
14         Authors: Kris
15 
16 *******************************************************************************/
17 
18 module ocean.text.convert.UnicodeBom;
19 
20 import core.exception : onUnicodeError;
21 
22 import ocean.meta.types.Qualifiers;
23 import ocean.core.ByteSwap;
24 
25 import  Utf = ocean.text.convert.Utf;
26 
27 version (unittest) import ocean.core.Test;
28 
29 /*******************************************************************************
30 
31      see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2
32 
33 *******************************************************************************/
34 
/// External encodings handled by this module.
///
/// The numeric value of each member is used directly as an index into
/// BomSniffer.lookup (see BomSniffer.setup), so members must stay in step
/// with the rows of that table.
enum Encoding
{
    Unknown  = 0,   // nothing known about the content
    UTF_8N   = 1,   // UTF-8, no signature registered in the lookup table
    UTF_8    = 2,   // UTF-8, signature EF BB BF
    UTF_16   = 3,   // UTF-16, byte order not specified
    UTF_16BE = 4,   // UTF-16 big-endian, signature FE FF
    UTF_16LE = 5,   // UTF-16 little-endian, signature FF FE
    UTF_32   = 6,   // UTF-32, byte order not specified
    UTF_32BE = 7,   // UTF-32 big-endian, signature 00 00 FE FF
    UTF_32LE = 8,   // UTF-32 little-endian, signature FF FE 00 00
}
46 
47 /*******************************************************************************
48 
49     Convert unicode content
50 
51     Unicode is an encoding of textual material. The purpose of this module
52     is to interface external-encoding with a programmer-defined internal-
53     encoding. This internal encoding is declared via the template argument
54     T, whilst the external encoding is either specified or derived.
55 
56     Three internal encodings are supported: char, wchar, and dchar. The
57     methods herein operate upon arrays of this type. That is, decode()
58     returns an array of the type, while encode() expects an array of said
59     type.
60 
61     Supported external encodings are as follows:
62 
63             Encoding.Unknown
64             Encoding.UTF_8N
65             Encoding.UTF_8
66             Encoding.UTF_16
67             Encoding.UTF_16BE
68             Encoding.UTF_16LE
69             Encoding.UTF_32
70             Encoding.UTF_32BE
71             Encoding.UTF_32LE
72 
73     These can be divided into non-explicit and explicit encodings:
74 
75             Encoding.Unknown
76             Encoding.UTF_8
77             Encoding.UTF_16
78             Encoding.UTF_32
79 
80 
81             Encoding.UTF_8N
82             Encoding.UTF_16BE
83             Encoding.UTF_16LE
84             Encoding.UTF_32BE
85             Encoding.UTF_32LE
86 
87     The former group of non-explicit encodings may be used to 'discover'
88     an unknown encoding, by examining the first few bytes of the content
89     for a signature. This signature is optional, but is often written such
90     that the content is self-describing. When an encoding is unknown, using
91     one of the non-explicit encodings will cause the decode() method to look
92     for a signature and adjust itself accordingly. It is possible that a
93     ZWNBSP character might be confused with the signature; today's unicode
94     content is supposed to use the WORD-JOINER character instead.
95 
96     The group of explicit encodings are for use when the content encoding
97     is known. These *must* be used when converting back to external encoding,
98     since written content must be in a known format. It should be noted that,
99     during a decode() operation, the existence of a signature is in conflict
100     with these explicit varieties.
101 
102 
103     See
104     $(LINK http://www.utf-8.com/)
105     $(LINK http://www.hackcraft.net/xmlUnicode/)
106     $(LINK http://www.unicode.org/faq/utf_bom.html/)
107     $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
108     $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)
109 
110 *******************************************************************************/
111 
class UnicodeBom(T) : BomSniffer
{
    // Reject unsupported element types at compile time rather than merely
    // printing a message and producing a class whose conversions do nothing.
    static assert (is (T == char) || is (T == wchar) || is (T == dchar),
                   "Template type must be char, wchar, or dchar");

    /***********************************************************************

        Construct an instance using the given external encoding ~ one
        of the Encoding.xx types

    ***********************************************************************/

    this (Encoding encoding)
    {
        setup (encoding);
    }

    /***********************************************************************

        Convert the provided content to the internal type T. The content
        is inspected for a BOM signature, which is stripped.

        onUnicodeError() is invoked if a signature is present when,
        according to the current encoding, it should not be. Conversely,
        it is invoked if there is no known signature, the current
        encoding expects one, and no fallback encoding is configured.

        When a signature is found (or a fallback is applied) the instance
        is reconfigured via setup(), so encoding() reflects the encoding
        actually used afterwards.

        Where 'ate' is provided, it will be set to the amount of input
        consumed (including the stripped signature) and the decoder
        operates in streaming-mode. That is: 'dst' should be supplied
        since it is not resized or allocated.

        NOTE(review): 'content' is handed to ByteSwap by pointer when the
        encoding requires swapping, so the caller's buffer is presumably
        modified in place ~ confirm against ocean.core.ByteSwap.

        Params:
            content = external (encoded) bytes
            dst     = optional output buffer
            ate     = optional out-counter of consumed input

        Returns:
            the content converted to an array of T

    ***********************************************************************/

    final const(T)[] decode (void[] content, T[] dst=null, size_t* ate=null)
    {
        // look for a BOM
        auto info = test (content);

        // are we expecting a BOM?
        if (lookup[encoding].test)
        {
            if (info)
            {
                // yep ~ and we got one
                setup (info.encoding, true);

                // strip BOM from content
                content = content [info.bom.length .. $];
            }
            else if (settings.fallback)
                // no BOM, but this encoding can be defaulted
                setup (settings.fallback, false);
            else
                onUnicodeError("UnicodeBom.decode :: unknown or missing BOM", 0);
        }
        else if (info)
            // found a BOM when using an explicit encoding
            onUnicodeError("UnicodeBom.decode :: explicit encoding does not permit BOM", 0);

        // convert it to internal representation
        auto ret = into (swapBytes(content), settings.type, dst, ate);

        // account for the signature bytes that were stripped above
        if (ate && info)
            *ate += info.bom.length;
        return ret;
    }

    /***********************************************************************

        Perform encoding of content. Note that the encoding must be
        of the explicit variety by the time we get here; otherwise
        onUnicodeError() is invoked.

        Params:
            content = text in the internal type T
            dst     = optional output buffer for the converted bytes

        Returns:
            exactly the encoded bytes, byte-swapped as the encoding
            requires. This is a slice of 'dst' when the conversion was
            written into it, otherwise a freshly duplicated array (so
            that 'content' itself is never byte-swapped).

    ***********************************************************************/

    final void[] encode (T[] content, void[] dst=null)
    {
        if (settings.test)
            onUnicodeError("UnicodeBom.encode :: cannot write to a non-specific encoding", 0);

        // convert it to external representation
        auto encoded = from (content, settings.type, dst);

        // The conversion landed in the caller's buffer: return only the part
        // that was actually written, not the whole of 'dst'. The length check
        // guards against the converter having grown the buffer in place, in
        // which case slicing 'dst' would be out of range.
        if (encoded.ptr is dst.ptr && encoded.length <= dst.length)
            return swapBytes (dst[0 .. encoded.length]);

        // The result lives elsewhere (possibly in 'content' itself when no
        // conversion was needed), so work on a private mutable copy.
        return swapBytes (encoded.dup);
    }

    /***********************************************************************

        Swap bytes around, as required by the current encoding. Swapping
        happens only for encodings flagged as endian-sensitive whose
        byte order differs from that of the host.

        Returns:
            'content', after any swapping has been applied

    ***********************************************************************/

    private final void[] swapBytes (void[] content)
    {
        bool endian = settings.endian;
        bool swap   = settings.bigEndian;

        // on a big-endian host it is the little-endian encodings that swap
        version (BigEndian)
            swap = !swap;

        if (endian && swap)
        {
            if (settings.type == Utf16)
                ByteSwap.swap16 (content.ptr, content.length);
            else
                ByteSwap.swap32 (content.ptr, content.length);
        }
        return content;
    }

    /***********************************************************************

        Convert from 'type' into the given T.

        When 'type' already matches T the input is simply reinterpreted
        as an array of T; no copy is made and 'dst' is not used.
        Otherwise the matching ocean.text.convert.Utf routine performs
        the conversion.

        Where 'ate' is provided, it will be set to the amount of input
        consumed and the decoder operates in streaming-mode. That is:
        'dst' should be supplied since it is not resized or allocated.
        In the pass-through case *ate is set to x.length, which for
        the void[] input is a byte count.

        Params:
            x    = input, laid out as an array of the element type
                   selected by 'type'
            type = one of Utf8, Utf16, Utf32
            dst  = optional output buffer
            ate  = optional out-counter of consumed input

        Returns:
            the converted content, or null if 'type' is not recognised

    ***********************************************************************/

    static const(T)[] into (void[] x, uint type, T[] dst=null, size_t* ate = null)
    {
        const(T)[] ret;

        static if (is (T == char))
        {
            if (type == Utf8)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(char[]) x;
            }
            else if (type == Utf16)
                ret = Utf.toString (cast(wchar[]) x, dst, ate);
            else if (type == Utf32)
                ret = Utf.toString (cast(dchar[]) x, dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Utf16)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(wchar[]) x;
            }
            else if (type == Utf8)
                ret = Utf.toString16 (cast(char[]) x, dst, ate);
            else if (type == Utf32)
                ret = Utf.toString16 (cast(dchar[]) x, dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Utf32)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(const(dchar)[]) x;
            }
            else if (type == Utf8)
                ret = Utf.toString32 (cast(char[]) x, dst, ate);
            else if (type == Utf16)
                ret = Utf.toString32 (cast(wchar[]) x, dst, ate);
        }
        return ret;
    }


    /***********************************************************************

        Convert from T into the given 'type'.

        When 'type' already matches T the input array itself is
        returned; no copy is made and 'dst' is not used. Otherwise the
        matching ocean.text.convert.Utf routine performs the conversion,
        with 'dst' reinterpreted as a buffer of the target element type.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input and the encoder operates
        in streaming-mode. That is: 'dst' should be supplied since
        it is not resized or allocated.

        Params:
            x    = text in the internal type T
            type = one of Utf8, Utf16, Utf32
            dst  = optional output buffer
            ate  = optional out-counter of consumed input elements

        Returns:
            the converted content, or null if 'type' is not recognised

    ***********************************************************************/

    static const(void)[] from (T[] x, uint type, void[] dst=null, size_t* ate=null)
    {
        const(void)[] ret;

        static if (is (T == char))
        {
            if (type == Utf8)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else if (type == Utf16)
                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
            else if (type == Utf32)
                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Utf16)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else if (type == Utf8)
                ret = Utf.toString (x, cast(char[]) dst, ate);
            else if (type == Utf32)
                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Utf32)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else if (type == Utf8)
                ret = Utf.toString (x, cast(char[]) dst, ate);
            else if (type == Utf16)
                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
        }

        return ret;
    }
}
354 
355 /*******************************************************************************
356 
357         Handle byte-order-mark prefixes
358 
359 *******************************************************************************/
360 
class BomSniffer
{
    private bool     found;        // was an encoding discovered?
    private Encoding encoder;      // the current encoding
    private const(Info)* settings;     // pointer to encoding configuration

    // One row of configuration per Encoding member.
    private struct  Info
    {
        int      type;          // type of element (char/wchar/dchar)
        Encoding encoding;      // Encoding.xx encoding
        string   bom;           // pattern to match for signature; immutable,
                                // since it refers to string-literal data
        bool     test,          // should we test for this encoding?
                 endian,        // this encoding have endian concerns?
                 bigEndian;     // is this a big-endian encoding?
        Encoding fallback;      // can this encoding be defaulted?
    }

    private enum {Utf8, Utf16, Utf32}

    // Indexed by Encoding value (see setup), so rows must stay in the same
    // order as the members of Encoding.
    private static const(Info[]) lookup = [
        {Utf8,  Encoding.Unknown,  null,        true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8N,   null,        true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8,    "\xEF\xBB\xBF",   false},
        {Utf16, Encoding.UTF_16,   null,        true,  false, false, Encoding.UTF_16BE},
        {Utf16, Encoding.UTF_16BE, "\xFE\xFF", false, true, true},
        {Utf16, Encoding.UTF_16LE, "\xFF\xFE", false, true},
        {Utf32, Encoding.UTF_32,   null,        true,  false, false, Encoding.UTF_32BE},
        {Utf32, Encoding.UTF_32BE, "\x00\x00\xFE\xFF", false, true, true},
        {Utf32, Encoding.UTF_32LE, "\xFF\xFE\x00\x00", false, true},
    ];

    /***********************************************************************

        Return the current encoding. This is either the originally
        specified encoding, or a derived one obtained by inspecting
        the content for a BOM. The latter is performed as part of
        the decode() method

    ***********************************************************************/

    final Encoding encoding ()
    {
        return encoder;
    }

    /***********************************************************************

        Was an encoding located in the text (configured via setup)

    ***********************************************************************/

    final bool encoded ()
    {
        return found;
    }

    /***********************************************************************

        Return the signature (BOM) of the current encoding. This is
        null for encodings which have no signature in the lookup table.

    ***********************************************************************/

    final const(void)[] signature ()
    {
        return settings.bom;
    }

    /***********************************************************************

        Configure this instance for the given encoding

        Params:
            encoding = encoding to select; used as an index into the
                       lookup table
            found    = whether the encoding was discovered in the content

    ***********************************************************************/

    final void setup (Encoding encoding, bool found = false)
    {
        this.settings = &lookup[encoding];
        this.encoder = encoding;
        this.found = found;
    }

    /***********************************************************************

        Scan the BOM signatures looking for a match at the start of
        'content'. We scan in reverse order to get the longest match
        first: the UTF-32LE signature begins with the UTF-16LE one, so
        it has to be tried before it.

        Returns:
            pointer to the matching table entry, or null if 'content'
            does not start with any known signature

    ***********************************************************************/

    static final const(Info)* test (void[] content)
    {
        // 'ref' lets us return the address of the table entry itself
        foreach_reverse (ref info; lookup)
        {
            // entries without a signature can never match
            if (info.bom is null)
                continue;

            auto len = info.bom.length;
            if (len <= content.length && content[0..len] == info.bom[0..len])
                return &info;
        }
        return null;
    }
}
463 
464 /*******************************************************************************
465 
466 *******************************************************************************/
467 
unittest
{
    // UTF-8 payload: "abc" followed by three 3-byte sequences
    void[] payload = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;

    // the same payload, prefixed with the UTF-8 signature
    void[] with_bom = "\xEF\xBB\xBF" ~ payload;

    char[256] scratch;
    size_t consumed;
    auto converter = new UnicodeBom!(char)(Encoding.Unknown);

    // Unknown encoding + signature present: the signature is detected,
    // counted as consumed input, and the converter switches to UTF_8
    auto decoded = converter.decode (with_bom, scratch, &consumed);
    test (consumed == with_bom.length);
    test (converter.encoding == Encoding.UTF_8);

    // the converter is now UTF_8; content without a signature is accepted
    // and consumed in full
    decoded = converter.decode (payload, scratch, &consumed);
    test (consumed == payload.length);
    test (converter.encoding == Encoding.UTF_8);
}