1 /*******************************************************************************
2 3 Copyright:
4 Copyright (c) 2004 Kris Bell.
5 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6 All rights reserved.
7 8 License:
9 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10 See LICENSE_TANGO.txt for details.
11 12 Version: Initial release: December 2005
13 14 Authors: Kris
15 16 *******************************************************************************/17 18 moduleocean.text.convert.UnicodeBom;
19 20 importcore.exception : onUnicodeError;
21 22 importocean.meta.types.Qualifiers;
23 importocean.core.ByteSwap;
24 25 importUtf = ocean.text.convert.Utf;
26 27 version (unittest) importocean.core.Test;
/*******************************************************************************

        Identifiers for the external unicode encodings handled by this
        module. The numeric value of each member is used as an index into
        BomSniffer.lookup, so the order here must match that table.

        see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2

*******************************************************************************/

enum Encoding
{
    Unknown  = 0,   // no signature in the lookup table; falls back to UTF_8
    UTF_8N   = 1,   // no signature in the lookup table; falls back to UTF_8
    UTF_8    = 2,   // signature EF BB BF
    UTF_16   = 3,   // no signature in the lookup table; falls back to UTF_16BE
    UTF_16BE = 4,   // signature FE FF
    UTF_16LE = 5,   // signature FF FE
    UTF_32   = 6,   // no signature in the lookup table; falls back to UTF_32BE
    UTF_32BE = 7,   // signature 00 00 FE FF
    UTF_32LE = 8,   // signature FF FE 00 00
}
/*******************************************************************************

        Convert unicode content

        Unicode is an encoding of textual material. The purpose of this module
        is to interface external-encoding with a programmer-defined internal-
        encoding. This internal encoding is declared via the template argument
        T, whilst the external encoding is either specified or derived.

        Three internal encodings are supported: char, wchar, and dchar. The
        methods herein operate upon arrays of this type. That is, decode()
        returns an array of the type, while encode() expects an array of said
        type.

        Supported external encodings are as follows:

                Encoding.Unknown
                Encoding.UTF_8N
                Encoding.UTF_8
                Encoding.UTF_16
                Encoding.UTF_16BE
                Encoding.UTF_16LE
                Encoding.UTF_32
                Encoding.UTF_32BE
                Encoding.UTF_32LE

        These can be divided into non-explicit and explicit encodings:

                Encoding.Unknown
                Encoding.UTF_8
                Encoding.UTF_16
                Encoding.UTF_32


                Encoding.UTF_8N
                Encoding.UTF_16BE
                Encoding.UTF_16LE
                Encoding.UTF_32BE
                Encoding.UTF_32LE

        The former group of non-explicit encodings may be used to 'discover'
        an unknown encoding, by examining the first few bytes of the content
        for a signature. This signature is optional, but is often written such
        that the content is self-describing. When an encoding is unknown, using
        one of the non-explicit encodings will cause the decode() method to look
        for a signature and adjust itself accordingly. It is possible that a
        ZWNBSP character might be confused with the signature; today's unicode
        content is supposed to use the WORD-JOINER character instead.

        The group of explicit encodings are for use when the content encoding
        is known. These *must* be used when converting back to external encoding,
        since written content must be in a known format. It should be noted that,
        during a decode() operation, the existence of a signature is in conflict
        with these explicit varieties.

        NOTE(review): the lookup table in BomSniffer flags Unknown, UTF_8N,
        UTF_16 and UTF_32 for signature testing and attaches the EF BB BF
        signature to UTF_8. In the code, therefore, UTF_8 behaves as the
        explicit variety and UTF_8N as the discoverable one, which is the
        reverse of the UTF_8 / UTF_8N grouping listed above. Confirm which
        of the two is intended.

        See
        $(LINK http://www.utf-8.com/)
        $(LINK http://www.hackcraft.net/xmlUnicode/)
        $(LINK http://www.unicode.org/faq/utf_bom.html/)
        $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
        $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)

*******************************************************************************/

class UnicodeBom(T) : BomSniffer
{
    // Only emits a compile-time message; instantiation with another type is
    // not rejected here.
    static if (!is (T == char) && !is (T == wchar) && !is (T == dchar))
        pragma (msg, "Template type must be char, wchar, or dchar");

    /***********************************************************************

            Construct an instance using the given external encoding ~ one
            of the Encoding.xx types

            Params:
                encoding = the external encoding to start out with

    ***********************************************************************/

    this (Encoding encoding)
    {
        setup (encoding);
    }

    /***********************************************************************

            Convert the provided content. The content is inspected
            for a BOM signature, which is stripped. An error is
            raised if a signature is present when, according to the
            encoding type, it should not be. Conversely, an error
            is raised if there is no known signature where the current
            encoding expects one to be present and no fallback encoding
            is configured for it.

            Where 'ate' is provided, it will be set to the number of
            elements consumed from the input and the decoder operates
            in streaming-mode. That is: 'dst' should be supplied since
            it is not resized or allocated.

            Note that 'content' is handed to swapBytes(), which operates
            on the supplied memory directly rather than on a copy.

            Params:
                content = external representation, optionally BOM-prefixed
                dst     = optional output buffer
                ate     = optional out-value receiving the consumed count;
                          the length of a stripped BOM is added to it

            Returns:
                the content converted to an array of T

            Errors are reported through onUnicodeError().

    ***********************************************************************/

    final const(T)[] decode (void[] content, T[] dst=null, size_t* ate=null)
    {
        // look for a BOM
        auto info = test (content);

        if (lookup[encoding].test)
        {
            // the current encoding expects us to look for a BOM
            if (info)
            {
                // yep ~ and we got one: adopt the encoding it denotes
                setup (info.encoding, true);

                // strip BOM from content
                content = content [info.bom.length .. $];
            }
            else if (settings.fallback)
            {
                // no BOM, but this encoding can be defaulted
                setup (settings.fallback, false);
            }
            else
            {
                onUnicodeError("UnicodeBom.decode :: unknown or missing BOM", 0);
            }
        }
        else if (info)
        {
            // found a BOM when using an explicit encoding
            onUnicodeError("UnicodeBom.decode :: explicit encoding does not permit BOM", 0);
        }

        // convert it to internal representation
        auto ret = into (swapBytes(content), settings.type, dst, ate);

        // account for the stripped signature in the consumed count.
        // NOTE(review): bom.length is a byte count, whereas the unit of
        // the value stored by into() depends on the conversion path taken
        // ~ confirm the two agree for the non-UTF-8 cases
        if (ate && info)
            *ate += info.bom.length;

        return ret;
    }

    /***********************************************************************

            Perform encoding of content. Note that the encoding must be
            of the explicit variety by the time we get here

            Params:
                content = text in the internal representation
                dst     = optional output buffer for the conversion

            Returns:
                the external representation, byte-swapped as required by
                the encoding. When the conversion result starts at 'dst',
                the returned slice is the converted prefix of 'dst';
                otherwise it is a freshly duplicated array, so 'content'
                itself is never byte-swapped.

            Errors are reported through onUnicodeError().

    ***********************************************************************/

    final void[] encode (T[] content, void[] dst=null)
    {
        if (settings.test)
            onUnicodeError("UnicodeBom.encode :: cannot write to a non-specific encoding", 0);

        // convert it to external representation
        auto dst_ret = from (content, settings.type, dst);

        // The result lives in the caller's buffer: hand back only the part
        // that was actually produced, not the whole (possibly larger) 'dst'.
        // The length guard covers a 'content' that aliases the start of a
        // shorter 'dst'; that case takes the duplicating path instead.
        if (dst_ret.ptr is dst.ptr && dst_ret.length <= dst.length)
            return swapBytes (dst[0 .. dst_ret.length]);

        return swapBytes (dst_ret.dup);
    }

    /***********************************************************************

            Swap bytes around, as required by the encoding

            Params:
                content = memory holding the external representation

            Returns:
                the same slice that was passed in

    ***********************************************************************/

    private final void[] swapBytes (void[] content)
    {
        bool endian = settings.endian;
        bool swap   = settings.bigEndian;

        // on a big-endian host it is the little-endian encodings that
        // need swapping
        version (BigEndian)
                 swap = !swap;

        if (endian && swap)
        {
            if (settings.type == Utf16)
                ByteSwap.swap16 (content.ptr, content.length);
            else
                ByteSwap.swap32 (content.ptr, content.length);
        }
        return content;
    }

    /***********************************************************************

            Convert from 'type' into the given T.

            Where 'ate' is provided, it will be set to the number of
            elements consumed from the input and the decoder operates
            in streaming-mode. That is: 'dst' should be supplied since
            it is not resized or allocated.

            When 'type' already matches T the input is reinterpreted and
            returned as is; 'dst' is not used and 'ate' receives x.length.

            Params:
                x    = input in the encoding denoted by 'type'
                type = one of Utf8, Utf16, Utf32
                dst  = optional output buffer
                ate  = optional out-value for the consumed count

            Returns:
                the converted content; empty if 'type' is none of the
                three known values

    ***********************************************************************/

    static const(T)[] into (void[] x, uint type, T[] dst=null, size_t* ate = null)
    {
        const(T)[] ret;

        static if (is (T == char))
        {
            if (type == Utf8)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(char[]) x;
            }
            else if (type == Utf16)
                ret = Utf.toString (cast(wchar[]) x, dst, ate);
            else if (type == Utf32)
                ret = Utf.toString (cast(dchar[]) x, dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Utf16)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(wchar[]) x;
            }
            else if (type == Utf8)
                ret = Utf.toString16 (cast(char[]) x, dst, ate);
            else if (type == Utf32)
                ret = Utf.toString16 (cast(dchar[]) x, dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Utf32)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(const(dchar)[]) x;
            }
            else if (type == Utf8)
                ret = Utf.toString32 (cast(char[]) x, dst, ate);
            else if (type == Utf16)
                ret = Utf.toString32 (cast(wchar[]) x, dst, ate);
        }

        return ret;
    }


    /***********************************************************************

            Convert from T into the given 'type'.

            Where 'ate' is provided, it will be set to the number of
            elements consumed from the input and the converter operates
            in streaming-mode. That is: 'dst' should be supplied since
            it is not resized or allocated.

            When 'type' already matches T the input itself is returned;
            'dst' is not used and 'ate' receives x.length.

            Params:
                x    = input in the internal representation
                type = one of Utf8, Utf16, Utf32
                dst  = optional output buffer
                ate  = optional out-value for the consumed count

            Returns:
                the converted content; empty if 'type' is none of the
                three known values

    ***********************************************************************/

    static const(void)[] from (T[] x, uint type, void[] dst=null, size_t* ate=null)
    {
        const(void)[] ret;

        static if (is (T == char))
        {
            if (type == Utf8)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else if (type == Utf16)
                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
            else if (type == Utf32)
                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Utf16)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else if (type == Utf8)
                ret = Utf.toString (x, cast(char[]) dst, ate);
            else if (type == Utf32)
                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Utf32)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else if (type == Utf8)
                ret = Utf.toString (x, cast(char[]) dst, ate);
            else if (type == Utf16)
                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
        }

        return ret;
    }
}
/*******************************************************************************

        Handle byte-order-mark prefixes

        Holds the currently selected encoding together with a pointer to its
        entry in the static lookup table, and provides a static scan for
        recognising a signature at the start of some content.

*******************************************************************************/

class BomSniffer
{
    private bool         found;      // was an encoding discovered?
    private Encoding     encoder;    // the current encoding
    private const(Info)* settings;   // pointer to encoding configuration;
                                     // null until setup() has been called

    private struct Info
    {
        int      type;               // type of element (char/wchar/dchar)
        Encoding encoding;           // Encoding.xx encoding
        char[]   bom;                // pattern to match for signature
        bool     test,               // should we test for this encoding?
                 endian,             // this encoding have endian concerns?
                 bigEndian;          // is this a big-endian encoding?
        Encoding fallback;           // can this encoding be defaulted?
    }

    private enum {Utf8, Utf16, Utf32}

    // Indexed by Encoding value: entries must stay in the same order as
    // the members of the Encoding enum.
    private static const(Info[]) lookup = [
        {Utf8,  Encoding.Unknown,  null,               true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8N,   null,               true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8,    "\xEF\xBB\xBF",     false},
        {Utf16, Encoding.UTF_16,   null,               true,  false, false, Encoding.UTF_16BE},
        {Utf16, Encoding.UTF_16BE, "\xFE\xFF",         false, true,  true},
        {Utf16, Encoding.UTF_16LE, "\xFF\xFE",         false, true},
        {Utf32, Encoding.UTF_32,   null,               true,  false, false, Encoding.UTF_32BE},
        {Utf32, Encoding.UTF_32BE, "\x00\x00\xFE\xFF", false, true,  true},
        {Utf32, Encoding.UTF_32LE, "\xFF\xFE\x00\x00", false, true},
    ];

    /***********************************************************************

            Return the current encoding. This is either the originally
            specified encoding, or a derived one obtained by inspecting
            the content for a BOM. The latter is performed as part of
            the decode() method

    ***********************************************************************/

    final Encoding encoding ()
    {
        return encoder;
    }

    /***********************************************************************

            Was an encoding located in the text (configured via setup)

    ***********************************************************************/

    final bool encoded ()
    {
        return found;
    }

    /***********************************************************************

            Return the signature (BOM) of the current encoding

            Returns:
                the signature bytes, or null when the current encoding
                has none or when setup() has not been called yet

    ***********************************************************************/

    final const(void)[] signature ()
    {
        // no configuration selected yet ~ there is nothing to dereference
        if (settings is null)
            return null;

        return settings.bom;
    }

    /***********************************************************************

            Configure this instance with unicode converters

            Params:
                encoding = encoding to select; used as index into lookup
                found    = whether the encoding was discovered in content

    ***********************************************************************/

    final void setup (Encoding encoding, bool found = false)
    {
        this.settings = &lookup[encoding];
        this.encoder = encoding;
        this.found = found;
    }

    /***********************************************************************

            Scan the BOM signatures looking for a match. We scan in
            reverse order to get the longest match first

            Params:
                content = data whose leading bytes are compared against
                          each known signature

            Returns:
                pointer to the matching lookup entry, or null if the
                content does not start with any known signature

    ***********************************************************************/

    static final const(Info)* test (void[] content)
    {
        // walk the table from the last entry down to the first, so that
        // the four-byte UTF-32LE signature is tried before the two-byte
        // UTF-16LE signature that is its prefix
        for (size_t i = lookup.length; i-- > 0;)
        {
            auto info = &lookup[i];

            if (info.bom)
            {
                auto len = info.bom.length;
                if (len <= content.length)
                    if (content[0..len] == info.bom[0..len])
                        return info;
            }
        }
        return null;
    }
}
/*******************************************************************************

        Decode UTF-8 content in streaming mode, first with a leading
        signature and then without one, checking the consumed count and
        the detected encoding each time.

*******************************************************************************/

unittest
{
    void[] payload = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;
    void[] with_bom = "\xEF\xBB\xBF" ~ payload;
    auto converter = new UnicodeBom!(char)(Encoding.Unknown);
    size_t consumed;
    char[256] scratch;

    // signature present: it is detected, and counted as consumed input
    auto decoded = converter.decode (with_bom, scratch, &consumed);
    test (consumed == with_bom.length);
    test (converter.encoding == Encoding.UTF_8);

    // same instance, no signature this time: encoding stays as detected
    decoded = converter.decode (payload, scratch, &consumed);
    test (consumed == payload.length);
    test (converter.encoding == Encoding.UTF_8);
}