/*******************************************************************************

    Character encoding conversion.

    Character encoding conversion using the C iconv library
    (ocean.text.util.c.Iconv).

    Usage:
        This module can be used by creating an instance of the StringEncode
        class with the template parameters of the desired character encoding
        conversion:

        ---

            auto string_enc = new StringEncode!("ISO-8859-1", "UTF-8");

        ---

        The conversion function is called as follows:

        ---

            char[] input = "A string to be converted";
            char[] output; // The buffer which is written into

            string_enc.convert(input, output);

        ---

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

*******************************************************************************/

module ocean.text.util.StringEncode;




import ocean.transition;

version(UnitTest) import ocean.core.Test;

static if (__VERSION__ >= 2000 && __VERSION__ < 2073)
{
    extern (C)
    {
        alias void* iconv_t;

        iconv_t iconv_open (in char* tocode, in char* fromcode);

        size_t iconv (iconv_t cd, char** inbuf, size_t* inbytesleft,
                      char** outbuf, size_t* outbytesleft);

        int iconv_close (iconv_t cd);
    }
}
else
    import core.sys.posix.iconv;

import core.stdc.errno;

/******************************************************************************

    IconvException

    Base class of all exceptions thrown by this module. It is also thrown
    directly by StringEncode.convert() for any iconv error which is neither
    an invalid nor an incomplete multibyte sequence.

*******************************************************************************/

class IconvException : Exception
{
    static immutable MSG = "Iconv: Error";

    /***************************************************************************

        Constructor.

        Params:
            msg  = exception message
            file = file where the exception is created
            line = line where the exception is created

    ***************************************************************************/

    this ( istring msg = MSG, istring file = __FILE__, int line = __LINE__ )
    {
        super(msg, file, line);
    }

    // Make the derived exception classes accessible as
    // IconvException.InvalidMbSeq / IconvException.IncompleteMbSeq.
    alias .InvalidMbSeq    InvalidMbSeq;
    alias .IncompleteMbSeq IncompleteMbSeq;
}

/**************************************************************************

    Invalid Multibyte Sequence

    Thrown by StringEncode.convert() when iconv reports EILSEQ.

 **************************************************************************/

class InvalidMbSeq :  IconvException
{
    static immutable msg = "Iconv: Invalid Multibyte Sequence";

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        super(this.msg, file, line);
    }
}

/**************************************************************************

    Incomplete Multibyte Sequence

    Thrown by StringEncode.convert() when iconv reports EINVAL.

 **************************************************************************/

class IncompleteMbSeq :  IconvException
{
    static immutable msg = "Iconv: Incomplete Multibyte Sequence";

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        super(this.msg, file, line);
    }
}

/*******************************************************************************

    Encoder interface.

*******************************************************************************/

interface StringEncoder
{
    /***************************************************************************

        Converts a string from one encoding to another.

        Params:
            input = string to convert
            output = converted string

    ***************************************************************************/

    public void convert ( cstring input, ref mstring output );
}



/*******************************************************************************

    StringEncode class
    The template parameters are the character encoding types for the input
    and output of the converter.

*******************************************************************************/

public class StringEncode ( istring fromcode, istring tocode ) : StringEncoder
{
    /***************************************************************************

        The conversion descriptor which iconv uses internally

    ***************************************************************************/

    private iconv_t cd;


    /***************************************************************************

        Exceptions which could be thrown by this class. (These are created as
        class members so that there is no risk of convert() being called over
        and over, and newing exceptions each time, leading to an accumulation of
        memory over time.)

    ***************************************************************************/

    private IconvException.InvalidMbSeq    exception_InvalidMbSeq;

    private IconvException.IncompleteMbSeq exception_IncompleteMbSeq;

    private IconvException                 exception_Generic;


    /***************************************************************************

        Constructor.
        Initialises iconv with the desired character encoding conversion types
        and creates the exception instances which convert() re-uses.

        The value returned by iconv_open() is stored without being checked.
        NOTE(review): an unsupported encoding pair presumably surfaces as a
        generic IconvException on the first call to convert() -- confirm
        against the iconv implementation in use.

    ***************************************************************************/

    public this ( )
    {
        this.cd = iconv_open(tocode.ptr, fromcode.ptr);

        this.exception_InvalidMbSeq    = new IconvException.InvalidMbSeq;

        this.exception_IncompleteMbSeq = new IconvException.IncompleteMbSeq;

        this.exception_Generic         = new IconvException;
    }


    /***************************************************************************

        Destructor.
        Simply closes down the C iconv library.

    ***************************************************************************/

    private ~this ( )
    {
        iconv_close(this.cd);
    }

    /***************************************************************************

        Converts a string in one encoding type to another (as specified by the
        class' template parameters).

        Makes a guess at the required size of output buffer, simply setting it
        to the same size as the input buffer. Then repeatedly tries converting
        the input and increasing the size of the output buffer until the
        conversion succeeds.

        To avoid repeated memory allocation, if you need to call this function
        many times, it's best to always pass the same output buffer.

        If the conversion fails, output holds whatever was converted before
        the error, the conversion descriptor is reset to its initial state (so
        that the failure cannot affect subsequent calls) and one of the
        pre-allocated exceptions is thrown.

        Params:
            input  = the array of characters to be converted.
            output = array of characters which will be filled with the results
                     of the conversion. The output array is resized to fit the
                     results.

        Throws:
            IconvException.InvalidMbSeq if iconv reports EILSEQ,
            IconvException.IncompleteMbSeq if iconv reports EINVAL,
            IconvException on any other iconv error.

    ***************************************************************************/

    public override void convert ( cstring input, ref mstring output )
    {
        enableStomping(output);
        output.length = input.length;
        enableStomping(output);

        // Do the conversion. Keep trying until there is no E2BIG error.
        size_t inbytesleft  = input.length;
        size_t outbytesleft = output.length;
        Const!(char)* inptr = input.ptr;
        char* outptr        = output.ptr;

        ptrdiff_t result;

        // errno is captured immediately after each iconv call: the array
        // resizing and enableStomping() calls below go through the runtime,
        // which gives no guarantee of leaving errno untouched.
        int  error_code = 0;
        bool failed     = false;
        bool too_big    = false;

        do
        {
            // Attempt the conversion
            // FIXME: remove cast with https://github.com/dlang/druntime/pull/1742
            result = iconv(this.cd, cast(char**) &inptr, &inbytesleft, &outptr, &outbytesleft);

            failed     = (result < 0);
            error_code = failed ? errno() : 0;

            // If it wasn't E2BIG, we're finished
            too_big = (failed && error_code == E2BIG);

            if (too_big)
            {
                // Conversion failed because the output buffer was too small.
                // Resize the output buffer and try again.
                // To improve performance, we pass the number of bytes already
                // processed to iconv. But, because extending the buffer may
                // result in a memory allocation, outptr may become invalid.

                // Convert 'outptr' to an index
                size_t out_so_far = outptr - output.ptr;

                output.length = output.length + input.length;
                outbytesleft += input.length;

                // Readjust outptr to the same position relative to output.ptr,
                // in case memory allocation just occurred
                outptr = output.ptr + out_so_far;
            }
        }
        while ( too_big );

        // Trim the unused tail of the output buffer
        output.length = output.length - outbytesleft;
        enableStomping(output);

        // Check for any errors from iconv and throw them as exceptions
        if (failed)
        {
            // Put the descriptor back into its initial conversion state so
            // that this encoder can safely be re-used after the failure. The
            // return value is deliberately ignored: the original error is the
            // one which gets reported.
            iconv(this.cd, null, null, null, null);

            switch (error_code)
            {
                case EILSEQ:
                    throw this.exception_InvalidMbSeq;

                case EINVAL:
                    throw this.exception_IncompleteMbSeq;

                default:
                    throw this.exception_Generic;
            }
        }
    }
}



/*******************************************************************************

    String encoder sequence. Runs a sequence of encoders over a string until one
    achieves a successful encoding.

    Params:
        Encoders = tuple of types of encoders

*******************************************************************************/

public class StringEncoderSequence ( Encoders... )
{
    /***************************************************************************

        Static constructor - ensures that all template types implement the
        Encoder interface.

    ***************************************************************************/

    static this ( )
    {
        foreach ( E; Encoders )
        {
            static assert(is(E : StringEncoder));
        }
    }


    /***************************************************************************

        Array of encoders.

    ***************************************************************************/

    private StringEncoder[] encoders;


    /***************************************************************************

        Constructor. News an instance of each of the template types.

    ***************************************************************************/

    public this ( )
    {
        foreach ( E; Encoders )
        {
            this.encoders ~= new E;
        }
    }

    /***************************************************************************

        Runs the encoders in sequence until one succeeds.

        This method is aliased with opCall.

        Params:
            input = text to convert
            output = converted text

        Returns:
            converted text, or "" if all encoders failed.

    ***************************************************************************/

    public mstring convert ( cstring input, ref mstring output )
    {
        output.length = 0;
        enableStomping(output);

        foreach ( encoder; this.encoders )
        {
            try
            {
                if ( convert(encoder, input, output) )
                {
                    return output;
                }
            }
            // Exceptions thrown by an encoder are ignored: the next encoder in
            // the sequence is tried instead. InvalidMbSeq and IncompleteMbSeq
            // both derive from IconvException, so this single clause covers
            // every iconv failure.
            catch ( IconvException )
            {
            }
        }

        // No encoder succeeded: discard any partial output
        output.length = 0;
        enableStomping(output);
        return output;
    }

    public alias convert opCall;


    /***************************************************************************

        Attempts to convert the given text with the given encoder.

        Only an invalid multibyte sequence is reported through the return
        value; any other exception thrown by the encoder propagates to the
        caller.

        Params:
            encoder = encoder to use
            input = text to convert
            output = converted text

        Returns:
            true if the text was converted successfully, false if the encoder
            threw IconvException.InvalidMbSeq

    ***************************************************************************/

    private bool convert ( StringEncoder encoder, cstring input, ref mstring output )
    {
        try
        {
            encoder.convert(input, output);
            return true;
        }
        catch ( IconvException.InvalidMbSeq )
        {
            return false;
        }
    }
}

///
unittest
{
    alias StringEncode!("UTF-8", "UTF-8//TRANSLIT") Utf8Converter;
    alias StringEncode!("ISO-8859-1", "UTF-8//TRANSLIT") Iso_8859_1_Converter;
    alias StringEncoderSequence!(Utf8Converter, Iso_8859_1_Converter) Utf8Encoder;

    Utf8Encoder utf8_encoder = new Utf8Encoder();
    mstring buff;
    utf8_encoder.convert("Soon\u2122", buff);
    test(buff == "Soon™", buff);
}