/*******************************************************************************

    Copyright:
        Copyright (c) 2004 Kris Bell.
        Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
        See LICENSE_TANGO.txt for details.

    Version: Initial release: December 2005

    Authors: Kris

*******************************************************************************/

module ocean.text.convert.UnicodeBom;

import core.exception : onUnicodeError;

import ocean.meta.types.Qualifiers;
import ocean.core.ByteSwap;

import Utf = ocean.text.convert.Utf;

version (unittest) import ocean.core.Test;

/*******************************************************************************

    The set of supported external encodings.

    The numeric value of each member is used as an index into
    BomSniffer.lookup (see BomSniffer.setup), so the order of the members
    here must match the order of the entries in that table.

    see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2

*******************************************************************************/

enum Encoding {
    Unknown,
    UTF_8N,
    UTF_8,
    UTF_16,
    UTF_16BE,
    UTF_16LE,
    UTF_32,
    UTF_32BE,
    UTF_32LE,
};

/*******************************************************************************

    Convert unicode content

    Unicode is an encoding of textual material. The purpose of this module
    is to interface external-encoding with a programmer-defined internal-
    encoding. This internal encoding is declared via the template argument
    T, whilst the external encoding is either specified or derived.

    Three internal encodings are supported: char, wchar, and dchar. The
    methods herein operate upon arrays of this type. That is, decode()
    returns an array of the type, while encode() expects an array of said
    type.

    Supported external encodings are as follows:

        Encoding.Unknown
        Encoding.UTF_8N
        Encoding.UTF_8
        Encoding.UTF_16
        Encoding.UTF_16BE
        Encoding.UTF_16LE
        Encoding.UTF_32
        Encoding.UTF_32BE
        Encoding.UTF_32LE

    These can be divided into non-explicit and explicit encodings. The
    grouping below follows the 'test' flag of the BomSniffer.lookup table:

        Encoding.Unknown
        Encoding.UTF_8N
        Encoding.UTF_16
        Encoding.UTF_32


        Encoding.UTF_8
        Encoding.UTF_16BE
        Encoding.UTF_16LE
        Encoding.UTF_32BE
        Encoding.UTF_32LE

    The former group of non-explicit encodings may be used to 'discover'
    an unknown encoding, by examining the first few bytes of the content
    for a signature. This signature is optional, but is often written such
    that the content is self-describing. When an encoding is unknown, using
    one of the non-explicit encodings will cause the decode() method to look
    for a signature and adjust itself accordingly. It is possible that a
    ZWNBSP character might be confused with the signature; today's unicode
    content is supposed to use the WORD-JOINER character instead.

    The group of explicit encodings are for use when the content encoding
    is known. These *must* be used when converting back to external encoding,
    since written content must be in a known format. It should be noted that,
    during a decode() operation, the existence of a signature is in conflict
    with these explicit varieties.


    See
    $(LINK http://www.utf-8.com/)
    $(LINK http://www.hackcraft.net/xmlUnicode/)
    $(LINK http://www.unicode.org/faq/utf_bom.html/)
    $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
    $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)

*******************************************************************************/

class UnicodeBom(T) : BomSniffer
{
    // Note: this only prints a message at compile time; it does not by
    // itself stop compilation for an unsupported T.
    static if (!is (T == char) && !is (T == wchar) && !is (T == dchar))
        pragma (msg, "Template type must be char, wchar, or dchar");

    /***********************************************************************

        Construct an instance using the given external encoding ~ one
        of the Encoding.xx types

        Params:
            encoding = the external encoding to start with

    ***********************************************************************/

    this (Encoding encoding)
    {
        setup (encoding);
    }

    /***********************************************************************

        Convert the provided content. The content is inspected
        for a BOM signature, which is stripped. An exception is
        thrown if a signature is present when, according to the
        encoding type, it should not be. Conversely, an exception
        is thrown if there is no known signature where the current
        encoding expects one to be present and the encoding has no
        fallback.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input and the decoder operates
        in streaming-mode. That is: 'dst' should be supplied since
        it is not resized or allocated. When a BOM was found, its
        length is added to the value stored in 'ate'.

        Note that a successful sniff (or a fallback) reconfigures this
        instance via setup(), so the current encoding may differ after
        the call.

        Params:
            content = the external (encoded) data; it is handed to
                      swapBytes(), so its bytes may be reordered
                      (presumably in place ~ see swapBytes)
            dst     = optional output buffer passed on to the converter
            ate     = optional out-value receiving the consumed count

        Returns:
            the content converted to the internal encoding T

    ***********************************************************************/

    final const(T)[] decode (void[] content, T[] dst=null, size_t* ate=null)
    {
        // look for a BOM
        auto info = test (content);

        // are we expecting a BOM?
        if (lookup[encoding].test)
        {
            if (info)
            {
                // yep ~ and we got one
                setup (info.encoding, true);

                // strip BOM from content
                content = content [info.bom.length .. $];
            }
            else
            {
                // can this encoding be defaulted?
                if (settings.fallback)
                    setup (settings.fallback, false);
                else
                    onUnicodeError("UnicodeBom.decode :: unknown or missing BOM", 0);
            }
        }
        else
        {
            if (info)
                // found a BOM when using an explicit encoding
                onUnicodeError("UnicodeBom.decode :: explicit encoding does not permit BOM", 0);
        }

        // convert it to internal representation
        auto ret = into (swapBytes(content), settings.type, dst, ate);
        if (ate && info)
            *ate += info.bom.length;
        return ret;
    }

    /***********************************************************************

        Perform encoding of content. Note that the encoding must be
        of the explicit variety by the time we get here

        Params:
            content = text in the internal encoding T
            dst     = optional output buffer handed to the converter

        Returns:
            the encoded bytes, byte-swapped as required by the current
            encoding. If the converter's result starts at dst.ptr, the
            result is returned as-is (no copy); otherwise a duplicate
            of the converter's result is returned, so that 'content' is
            never byte-swapped.

    ***********************************************************************/

    final void[] encode (T[] content, void[] dst=null)
    {
        if (settings.test)
            onUnicodeError("UnicodeBom.encode :: cannot write to a non-specific encoding", 0);

        // convert it to external representation, and write
        auto dst_ret = from (content, settings.type, dst);
        if (dst_ret.ptr is dst.ptr)
            // The result aliases the caller-supplied (mutable) buffer, so
            // dropping const is safe. Return only the part that was
            // actually produced rather than the whole of 'dst', which may
            // be larger than the encoded output.
            return swapBytes (cast(void[]) dst_ret);
        else
            return swapBytes (dst_ret.dup);
    }

    /***********************************************************************

        Swap bytes around, as required by the encoding

        A swap is requested only when the current encoding has endian
        concerns and its byte order differs from that selected by the
        BigEndian version identifier.

        NOTE(review): ByteSwap is given content.ptr, so the swap is
        presumably performed in place ~ confirm against ByteSwap.

        Params:
            content = the bytes to (possibly) swap

        Returns:
            the same slice that was passed in

    ***********************************************************************/

    private final void[] swapBytes (void[] content)
    {
        bool endian = settings.endian;
        bool swap = settings.bigEndian;

        version (BigEndian)
            swap = !swap;

        if (endian && swap)
        {
            if (settings.type == Utf16)
                ByteSwap.swap16 (content.ptr, content.length);
            else
                ByteSwap.swap32 (content.ptr, content.length);
        }
        return content;
    }

    /***********************************************************************

        Convert from 'type' into the given T.

        When 'type' already corresponds to T, the input is returned
        re-cast without conversion, and 'ate' (if given) is set to
        x.length ~ which, x being a void[], is a byte count. Otherwise
        the matching Utf converter is called with 'dst' and 'ate'. An
        unrecognised 'type' yields a null result.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input and the decoder operates
        in streaming-mode. That is: 'dst' should be supplied since
        it is not resized or allocated.

        Params:
            x    = the source data
            type = one of Utf8, Utf16, Utf32 describing x
            dst  = optional output buffer for the converter
            ate  = optional out-value receiving the consumed count

    ***********************************************************************/

    static const(T)[] into (void[] x, uint type, T[] dst=null, size_t* ate = null)
    {
        const(T)[] ret;

        static if (is (T == char))
        {
            if (type == Utf8)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(char[]) x;
            }
            else
            if (type == Utf16)
                ret = Utf.toString (cast(wchar[]) x, dst, ate);
            else
            if (type == Utf32)
                ret = Utf.toString (cast(dchar[]) x, dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Utf16)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(wchar[]) x;
            }
            else
            if (type == Utf8)
                ret = Utf.toString16 (cast(char[]) x, dst, ate);
            else
            if (type == Utf32)
                ret = Utf.toString16 (cast(dchar[]) x, dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Utf32)
            {
                if (ate)
                    *ate = x.length;
                ret = cast(const(dchar)[]) x;
            }
            else
            if (type == Utf8)
                ret = Utf.toString32 (cast(char[]) x, dst, ate);
            else
            if (type == Utf16)
                ret = Utf.toString32 (cast(wchar[]) x, dst, ate);
        }
        return ret;
    }


    /***********************************************************************

        Convert from T into the given 'type'.

        When 'type' already corresponds to T, 'x' itself is returned
        (no copy is made) and 'ate', if given, is set to x.length.
        Otherwise the matching Utf converter is called with 'dst'
        (re-cast to the target element type) and 'ate'. An unrecognised
        'type' yields a null result.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input and the decoder operates
        in streaming-mode. That is: 'dst' should be supplied since
        it is not resized or allocated.

        Params:
            x    = the source text in the internal encoding T
            type = one of Utf8, Utf16, Utf32 selecting the output form
            dst  = optional output buffer for the converter
            ate  = optional out-value receiving the consumed count

    ***********************************************************************/

    static const(void)[] from (T[] x, uint type, void[] dst=null, size_t* ate=null)
    {
        const(void)[] ret;

        static if (is (T == char))
        {
            if (type == Utf8)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else
            if (type == Utf16)
                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
            else
            if (type == Utf32)
                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Utf16)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else
            if (type == Utf8)
                ret = Utf.toString (x, cast(char[]) dst, ate);
            else
            if (type == Utf32)
                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Utf32)
            {
                if (ate)
                    *ate = x.length;
                ret = x;
            }
            else
            if (type == Utf8)
                ret = Utf.toString (x, cast(char[]) dst, ate);
            else
            if (type == Utf16)
                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
        }

        return ret;
    }
}

/*******************************************************************************

    Handle byte-order-mark prefixes

*******************************************************************************/

class BomSniffer
{
    private bool            found;      // was an encoding discovered?
    private Encoding        encoder;    // the current encoding
    private const(Info)*    settings;   // pointer to encoding configuration

    private struct Info
    {
        int         type;       // type of element (char/wchar/dchar)
        Encoding    encoding;   // Encoding.xx encoding
        char[]      bom;        // pattern to match for signature
        bool        test,       // should we test for this encoding?
                    endian,     // this encoding have endian concerns?
                    bigEndian;  // is this a big-endian encoding?
        Encoding    fallback;   // can this encoding be defaulted?
    };

    private enum {Utf8, Utf16, Utf32};

    // One entry per Encoding member, in the same order as the enum:
    // setup() indexes this table with the Encoding value.
    private static const(Info[]) lookup = [
        {Utf8,  Encoding.Unknown,  null,               true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8N,   null,               true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8,    "\xEF\xBB\xBF",     false},
        {Utf16, Encoding.UTF_16,   null,               true,  false, false, Encoding.UTF_16BE},
        {Utf16, Encoding.UTF_16BE, "\xFE\xFF",         false, true, true},
        {Utf16, Encoding.UTF_16LE, "\xFF\xFE",         false, true},
        {Utf32, Encoding.UTF_32,   null,               true,  false, false, Encoding.UTF_32BE},
        {Utf32, Encoding.UTF_32BE, "\x00\x00\xFE\xFF", false, true, true},
        {Utf32, Encoding.UTF_32LE, "\xFF\xFE\x00\x00", false, true},
    ];

    /***********************************************************************

        Return the current encoding. This is either the originally
        specified encoding, or a derived one obtained by inspecting
        the content for a BOM. The latter is performed as part of
        the decode() method

    ***********************************************************************/

    final Encoding encoding ()
    {
        return encoder;
    }

    /***********************************************************************

        Was an encoding located in the text (configured via setup)

    ***********************************************************************/

    final bool encoded ()
    {
        return found;
    }

    /***********************************************************************

        Return the signature (BOM) of the current encoding. This is
        null for encodings whose table entry has no signature.

    ***********************************************************************/

    final const(void)[] signature ()
    {
        return settings.bom;
    }

    /***********************************************************************

        Configure this instance with unicode converters

        Params:
            encoding = the encoding to switch to; used as an index
                       into the lookup table
            found    = whether this encoding was discovered in the
                       content (reported by encoded())

    ***********************************************************************/

    final void setup (Encoding encoding, bool found = false)
    {
        this.settings = &lookup[encoding];
        this.encoder = encoding;
        this.found = found;
    }

    /***********************************************************************

        Scan the BOM signatures looking for a match. We scan in
        reverse order to get the longest match first

        Params:
            content = the data whose leading bytes are inspected

        Returns:
            a pointer to the matching lookup entry, or null when the
            content does not start with any known signature

    ***********************************************************************/

    static final const(Info)* test (void[] content)
    {
        for (auto info=lookup.ptr+lookup.length; --info >= lookup.ptr;)
        {
            // entries without a signature can never match
            if (info.bom)
            {
                auto len = info.bom.length;
                if (len <= content.length)
                    if (content[0..len] == info.bom[0..len])
                        return info;
            }
        }
        return null;
    }
}

/*******************************************************************************

    Decode UTF-8 content with and without a leading signature, starting
    from Encoding.Unknown.

*******************************************************************************/

unittest
{
    void[] INPUT2 = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;
    void[] INPUT = "\xEF\xBB\xBF" ~ INPUT2;
    auto bom = new UnicodeBom!(char)(Encoding.Unknown);
    size_t ate;
    char[256] buf;

    auto temp = bom.decode (INPUT, buf, &ate);
    test (ate == INPUT.length);
    test (bom.encoding == Encoding.UTF_8);

    temp = bom.decode (INPUT2, buf, &ate);
    test (ate == INPUT2.length);
    test (bom.encoding == Encoding.UTF_8);
}