/*******************************************************************************

    Character encoding conversion.

    Character encoding conversion using the C iconv library
    (ocean.text.util.c.Iconv).

    Usage:
        This module can be used by creating an instance of the StringEncode
        class with the template parameters of the desired character encoding
        conversion:

        ---

            auto string_enc = new StringEncode!("ISO-8859-1", "UTF-8");

        ---

        The conversion function is called as follows:

        ---

            char[] input = "A string to be converted";
            char[] output; // The buffer which is written into

            string_enc.convert(input, output);

        ---

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

*******************************************************************************/

module ocean.text.util.StringEncode;




import ocean.meta.types.Qualifiers;

version (unittest) import ocean.core.Test;

import core.sys.posix.iconv;
import core.stdc.errno;

/******************************************************************************

    IconvException

    Base class of all exceptions thrown by this module. The two specialised
    exception classes are also reachable through it as
    IconvException.InvalidMbSeq and IconvException.IncompleteMbSeq.

*******************************************************************************/

class IconvException : Exception
{
    static immutable MSG = "Iconv: Error";

    this ( istring msg = MSG, istring file = __FILE__, int line = __LINE__ )
    {
        super(msg, file, line);
    }

    alias .InvalidMbSeq    InvalidMbSeq;
    alias .IncompleteMbSeq IncompleteMbSeq;
}

/**************************************************************************

    Invalid Multibyte Sequence

    Thrown by StringEncode.convert() when iconv reports EILSEQ.

 **************************************************************************/

class InvalidMbSeq :  IconvException
{
    static immutable msg = "Iconv: Invalid Multibyte Sequence";

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        super(this.msg, file, line);
    }
}

/**************************************************************************

    Incomplete Multibyte Sequence

    Thrown by StringEncode.convert() when iconv reports EINVAL.

 **************************************************************************/

class IncompleteMbSeq :  IconvException
{
    static immutable msg = "Iconv: Incomplete Multibyte Sequence";

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        super(this.msg, file, line);
    }
}

/*******************************************************************************

    Encoder interface.

*******************************************************************************/

interface StringEncoder
{
    /***************************************************************************

        Converts a string from one encoding to another.

        Params:
            input = string to convert
            output = converted string

    ***************************************************************************/

    public void convert ( cstring input, ref mstring output );
}



/*******************************************************************************

    StringEncode class
    The template parameters are the character encoding types for the input
    and output of the converter.

*******************************************************************************/

public class StringEncode ( istring fromcode, istring tocode ) : StringEncoder
{
    /***************************************************************************

        The conversion descriptor which iconv uses internally

    ***************************************************************************/

    private iconv_t cd;


    /***************************************************************************

        Exceptions which could be thrown by this class. (These are created as
        class members so that there is no risk of convert() being called over
        and over, and newing exceptions each time, leading to an accumulation of
        memory over time.)

    ***************************************************************************/

    private IconvException.InvalidMbSeq    exception_InvalidMbSeq;

    private IconvException.IncompleteMbSeq exception_IncompleteMbSeq;

    private IconvException                 exception_Generic;


    /***************************************************************************

        Constructor.
        Initialises iconv with the desired character encoding conversion types
        and creates the reusable exception instances.

        If iconv_open() fails (for example because one of the encodings is not
        supported), the constructor does not throw; every subsequent call to
        convert() throws the generic IconvException instead.

    ***************************************************************************/

    public this ( )
    {
        this.cd = iconv_open(tocode.ptr, fromcode.ptr);

        this.exception_InvalidMbSeq = new IconvException.InvalidMbSeq;

        this.exception_IncompleteMbSeq = new IconvException.IncompleteMbSeq;

        this.exception_Generic = new IconvException;
    }


    /***************************************************************************

        Destructor.
        Simply closes down the C iconv library. Nothing is closed if
        iconv_open() failed in the constructor.

    ***************************************************************************/

    private ~this ( )
    {
        if ( this.descriptorValid() )
        {
            iconv_close(this.cd);
        }
    }

    /***************************************************************************

        Returns:
            true if iconv_open() succeeded in the constructor, false if it
            returned the error value (iconv_t) -1.

    ***************************************************************************/

    private bool descriptorValid ( )
    {
        return this.cd !is cast(iconv_t) -1;
    }

    /***************************************************************************

        Converts a string in one encoding type to another (as specified by the
        class' template parameters).

        Makes a guess at the required size of output buffer, simply setting it
        to the same size as the input buffer. Then repeatedly tries converting
        the input and increasing the size of the output buffer until the
        conversion succeeds.

        To avoid repeated memory allocation, if you need to call this function
        many times, it's best to always pass the same output buffer.

        Params:
            input = the array of characters to be converted.
            output = array of characters which will be filled with the results
                     of the conversion. The output array is resized to fit the
                     results. If an exception is thrown, it holds whatever was
                     converted before the error occurred.

        Throws:
            IconvException.InvalidMbSeq if iconv reports EILSEQ,
            IconvException.IncompleteMbSeq if iconv reports EINVAL,
            IconvException on any other iconv error, including a conversion
            descriptor that could not be opened. The thrown objects are
            pre-allocated members which are reused between calls.

    ***************************************************************************/

    public override void convert ( cstring input, ref mstring output )
    {
        if ( !this.descriptorValid() )
        {
            // iconv_open() failed: do not hand the invalid descriptor to
            // iconv(), just report the failure with an empty output.
            output.length = 0;
            assumeSafeAppend(output);
            throw this.exception_Generic;
        }

        // Put the descriptor back into its initial conversion state, so that
        // a previous conversion which was aborted by an error cannot
        // influence this one.
        iconv(this.cd, null, null, null, null);

        assumeSafeAppend(output);
        output.length = input.length;
        assumeSafeAppend(output);

        // Do the conversion. Keep trying until there is no E2BIG error.
        size_t inbytesleft  = input.length;
        size_t outbytesleft = output.length;
        const(char)* inptr  = input.ptr;
        char* outptr        = output.ptr;

        ptrdiff_t result;

        // errno value of the most recent iconv() call, 0 if it succeeded
        int error_code = 0;

        bool too_big = false;

        do
        {
            // Attempt the conversion
            // FIXME: remove cast with https://github.com/dlang/druntime/pull/1742
            result = iconv(this.cd, cast(char**) &inptr, &inbytesleft, &outptr, &outbytesleft);

            // Read errno straight away: the array operations below call into
            // the runtime, which is free to overwrite it.
            error_code = (result < 0) ? errno() : 0;

            // If it wasn't E2BIG, we're finished
            too_big = (error_code == E2BIG);

            if (too_big)
            {
                // Conversion failed because the output buffer was too small.
                // Resize the output buffer and try again.
                // To improve performance, we pass the number of bytes already
                // processed to iconv. But, because extending the buffer may
                // result in a memory allocation, outptr may become invalid.

                // Convert 'outptr' to an index
                size_t out_so_far = outptr - output.ptr;

                output.length = output.length + input.length;
                outbytesleft += input.length;

                // Readjust outptr to the same position relative to output.ptr,
                // in case memory allocation just occurred
                outptr = output.ptr + out_so_far;
            }
        }
        while ( too_big );

        // Cut off the part of the buffer which iconv did not write to
        output.length = output.length - outbytesleft;
        assumeSafeAppend(output);

        // Check for any errors from iconv and throw them as exceptions
        if (result < 0)
        {
            switch (error_code)
            {
                case EILSEQ:
                    throw this.exception_InvalidMbSeq;

                case EINVAL:
                    throw this.exception_IncompleteMbSeq;

                default:
                    throw this.exception_Generic;
            }
        }
    }
}



/*******************************************************************************

    String encoder sequence. Runs a sequence of encoders over a string until one
    achieves a successful encoding.

    Params:
        Encoders = tuple of types of encoders

*******************************************************************************/

public class StringEncoderSequence ( Encoders... )
{
    /***************************************************************************

        Static constructor - ensures that all template types implement the
        Encoder interface.

    ***************************************************************************/

    static this ( )
    {
        foreach ( E; Encoders )
        {
            static assert(is(E : StringEncoder));
        }
    }


    /***************************************************************************

        Array of encoders.

    ***************************************************************************/

    private StringEncoder[] encoders;


    /***************************************************************************

        Constructor. News an instance of each of the template types.

    ***************************************************************************/

    public this ( )
    {
        foreach ( E; Encoders )
        {
            this.encoders ~= new E;
        }
    }

    /***************************************************************************

        Runs the encoders in sequence until one succeeds.

        This method is aliased with opCall.

        Params:
            input = text to convert
            output = converted text

        Returns:
            converted text, or "" if all encoders failed.

    ***************************************************************************/

    public mstring convert ( cstring input, ref mstring output )
    {
        output.length = 0;
        assumeSafeAppend(output);

        foreach ( encoder; this.encoders )
        {
            // An encoder which fails with an IconvException is skipped and
            // the next one in the sequence is tried.
            if ( this.convert(encoder, input, output) )
            {
                return output;
            }
        }

        // No encoder succeeded: discard any partial result
        output.length = 0;
        assumeSafeAppend(output);
        return output;
    }

    public alias convert opCall;


    /***************************************************************************

        Attempts to convert the given text with the given encoder.

        Params:
            encoder = encoder to use
            input = text to convert
            output = converted text

        Returns:
            true if the text was converted successfully, false if the encoder
            threw an IconvException (of any kind)

    ***************************************************************************/

    private bool convert ( StringEncoder encoder, cstring input, ref mstring output )
    {
        try
        {
            encoder.convert(input, output);
            return true;
        }
        // InvalidMbSeq and IncompleteMbSeq both derive from IconvException,
        // so this single clause covers every failure an encoder reports.
        catch ( IconvException )
        {
            return false;
        }
    }
}

///
unittest
{
    alias StringEncode!("UTF-8", "UTF-8//TRANSLIT") Utf8Converter;
    alias StringEncode!("ISO-8859-1", "UTF-8//TRANSLIT") Iso_8859_1_Converter;
    alias StringEncoderSequence!(Utf8Converter, Iso_8859_1_Converter) Utf8Encoder;

    Utf8Encoder utf8_encoder = new Utf8Encoder();
    mstring buff;
    utf8_encoder.convert("Soon\u2122", buff);
    test(buff == "Soon™", buff);
}