1 /*******************************************************************************
2 
3     Character encoding conversion.
4 
5     Character encoding conversion using the C iconv library
6     (ocean.text.util.c.Iconv).
7 
8     Usage:
9         This module can be used by creating an instance of the StringEncode
10         class with the template parameters of the desired character encoding
11         conversion:
12 
13         ---
14 
15             auto string_enc = new StringEncode!("ISO-8859-1", "UTF-8");
16 
17         ---
18 
19         The conversion function is called as follows:
20 
21         ---
22 
            cstring input = "A string to be converted";
24             char[] output; // The buffer which is written into
25 
26             string_enc.convert(input, output);
27 
28         ---
29 
30     Copyright:
31         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
32         All rights reserved.
33 
34     License:
35         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
36         Alternatively, this file may be distributed under the terms of the Tango
37         3-Clause BSD License (see LICENSE_BSD.txt for details).
38 
39 *******************************************************************************/
40 
41 module ocean.text.util.StringEncode;
42 
43 
44 
45 
46 import ocean.transition;
47 
48 version(UnitTest) import ocean.core.Test;
49 
// Compilers whose __VERSION__ lies in [2000, 2073) get local declarations of
// the C iconv API; every other compiler version takes them from druntime's
// core.sys.posix.iconv module instead.
static if (__VERSION__ < 2073 && __VERSION__ >= 2000)
{
    /// Opaque conversion descriptor handle used by the iconv functions.
    extern (C) alias void* iconv_t;

    /// Opens a conversion descriptor converting from `fromcode` to `tocode`.
    extern (C) iconv_t iconv_open (in char* tocode, in char* fromcode);

    /// Converts bytes from `*inbuf` into `*outbuf`, advancing both pointers
    /// and decrementing both byte counters as it goes.
    extern (C) size_t iconv (iconv_t cd, char** inbuf, size_t* inbytesleft,
        char** outbuf, size_t* outbytesleft);

    /// Closes a conversion descriptor obtained from iconv_open().
    extern (C) int iconv_close (iconv_t cd);
}
else
{
    import core.sys.posix.iconv;
}
66 
67 import core.stdc.errno;
68 
69 /******************************************************************************
70 
71     IconvException
72 
73 *******************************************************************************/
74 
class IconvException : Exception
{
    /***************************************************************************

        Convenience aliases so that the specialised exception types declared
        at module scope can also be referred to as members of this class
        (IconvException.InvalidMbSeq, IconvException.IncompleteMbSeq).

    ***************************************************************************/

    alias .InvalidMbSeq InvalidMbSeq;
    alias .IncompleteMbSeq IncompleteMbSeq;

    /***************************************************************************

        Message used when the constructor is called without an explicit one.

    ***************************************************************************/

    static immutable MSG = "Iconv: Error";

    /***************************************************************************

        Constructor.

        Params:
            msg  = exception message, defaults to MSG
            file = source file reported by the exception, defaults to the
                   file of the call site
            line = source line reported by the exception, defaults to the
                   line of the call site

    ***************************************************************************/

    this ( istring msg = MSG, istring file = __FILE__, int line = __LINE__ )
    {
        super(msg, file, line);
    }
}
87 
88 /**************************************************************************
89 
90     Invalid Multibyte Sequence
91 
92  **************************************************************************/
93 
class InvalidMbSeq :  IconvException
{
    /***************************************************************************

        Fixed message carried by every instance of this exception.

    ***************************************************************************/

    static immutable msg = "Iconv: Invalid Multibyte Sequence";

    /***************************************************************************

        Constructor. Passes the fixed message on to IconvException.

        Params:
            file = source file reported by the exception, defaults to the
                   file of the call site
            line = source line reported by the exception, defaults to the
                   line of the call site

    ***************************************************************************/

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        super(InvalidMbSeq.msg, file, line);
    }
}
103 
104 /**************************************************************************
105 
106     Incomplete Multibyte Sequence
107 
108  **************************************************************************/
109 
class IncompleteMbSeq :  IconvException
{
    /***************************************************************************

        Fixed message carried by every instance of this exception.

    ***************************************************************************/

    static immutable msg = "Iconv: Incomplete Multibyte Sequence";

    /***************************************************************************

        Constructor. Passes the fixed message on to IconvException.

        Params:
            file = source file reported by the exception, defaults to the
                   file of the call site
            line = source line reported by the exception, defaults to the
                   line of the call site

    ***************************************************************************/

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        super(IncompleteMbSeq.msg, file, line);
    }
}
119 
120 /*******************************************************************************
121 
122     Encoder interface.
123 
124 *******************************************************************************/
125 
interface StringEncoder
{
    /***************************************************************************

        Re-encodes a string.

        The source and destination encodings are a property of the
        implementing class; this interface only fixes the call signature.

        Params:
            input = text to be re-encoded
            output = buffer receiving the re-encoded text; passed by
                     reference so that an implementation can resize it

    ***************************************************************************/

    void convert ( cstring input, ref mstring output );
}
140 
141 
142 
143 /*******************************************************************************
144 
145     StringEncode class
146     The template parameters are the character encoding types for the input
147     and output of the converter.
148 
149 *******************************************************************************/
150 
public class StringEncode ( istring fromcode, istring tocode ) : StringEncoder
{
    /***************************************************************************

        The conversion descriptor which iconv uses internally

    ***************************************************************************/

    private iconv_t cd;


    /***************************************************************************

        Exceptions which could be thrown by this class. (These are created as
        class members so that there is no risk of convert() being called over
        and over, and newing exceptions each time, leading to an accumulation of
        memory over time.)

    ***************************************************************************/

    private IconvException.InvalidMbSeq exception_InvalidMbSeq;

    private IconvException.IncompleteMbSeq exception_IncompleteMbSeq;

    private IconvException exception_Generic;


    /***************************************************************************

        Constructor.
        Opens the iconv conversion descriptor for the encoding pair given by
        the template parameters and creates the reusable exception instances.

        The result of iconv_open() is not checked here: if the descriptor
        could not be opened, the failure surfaces when convert() calls
        iconv() with it.

    ***************************************************************************/

    public this ( )
    {
        this.cd = iconv_open(tocode.ptr, fromcode.ptr);

        this.exception_InvalidMbSeq = new IconvException.InvalidMbSeq;

        this.exception_IncompleteMbSeq = new IconvException.IncompleteMbSeq;

        this.exception_Generic = new IconvException;
    }


    /***************************************************************************

        Destructor.
        Closes the iconv conversion descriptor, unless iconv_open() failed in
        the constructor (in which case there is nothing to close).

    ***************************************************************************/

    private ~this ( )
    {
        // iconv_open() reports failure by returning (iconv_t) -1
        if (this.cd !is cast(iconv_t) -1)
        {
            iconv_close(this.cd);
        }
    }

    /***************************************************************************

        Converts a string in one encoding type to another (as specified by the
        class' template parameters).

        Makes a guess at the required size of output buffer, simply setting it
        to the same size as the input buffer. Then repeatedly tries converting
        the input and increasing the size of the output buffer until the
        conversion succeeds.

        To avoid repeated memory allocation, if you need to call this function
        many times, it's best to always pass the same output buffer.

        Note that the conversion descriptor is not reset between calls.

        Params:
            input = the array of characters to be converted.
            output = array of characters which will be filled with the results
                     of the conversion. The output array is resized to fit the
                     results. If an exception is thrown, it holds whatever
                     was converted before the error occurred.

        Throws:
            IconvException.InvalidMbSeq if iconv reports EILSEQ,
            IconvException.IncompleteMbSeq if iconv reports EINVAL,
            IconvException on any other iconv error. The thrown objects are
            instances owned and reused by this encoder.

    ***************************************************************************/

    public override void convert ( cstring input, ref mstring output )
    {
        enableStomping(output);
        output.length = input.length;
        enableStomping(output);

        // The buffer is extended by this many bytes on each E2BIG. It must
        // never be zero, otherwise an E2BIG reported for an empty input
        // would make the loop below spin forever without enlarging the
        // buffer.
        size_t grow_by = input.length ? input.length : 16;

        // Do the conversion. Keep trying until there is no E2BIG error.
        size_t inbytesleft  = input.length;
        size_t outbytesleft = output.length;
        Const!(char)* inptr  = input.ptr;
        char* outptr = output.ptr;

        ptrdiff_t result;

        // errno value of the most recent iconv() call, or 0 if it succeeded.
        // It is captured right after the call, because the buffer handling
        // further down goes through the runtime, which is free to change
        // errno.
        int error_code = 0;

        bool too_big = false;

        do
        {
            // Attempt the conversion
            // FIXME: remove cast with https://github.com/dlang/druntime/pull/1742
            result = iconv(this.cd, cast(char**) &inptr, &inbytesleft, &outptr, &outbytesleft);

            error_code = (result < 0) ? errno() : 0;

            // If it wasn't E2BIG, we're finished
            too_big = (result < 0 && error_code == E2BIG);

            if (too_big)
            {
                // Conversion failed because the output buffer was too small.
                // Resize the output buffer and try again.
                // To improve performance, we pass the number of bytes already
                // processed to iconv. But, because extending the buffer may
                // result in a memory allocation, outptr may become invalid.

                // Convert 'outptr' to an index
                size_t out_so_far = outptr - output.ptr;

                output.length = output.length + grow_by;
                outbytesleft += grow_by;

                // Readjust outptr to the same position relative to output.ptr,
                // in case memory allocation just occurred
                outptr = output.ptr + out_so_far;
            }
        }
        while ( too_big );

        // Trim the unused tail of the buffer
        output.length = output.length - outbytesleft;
        enableStomping(output);

        // Check for any errors from iconv and throw them as exceptions
        if (result < 0)
        {
            switch (error_code)
            {
                case EILSEQ:
                    throw this.exception_InvalidMbSeq;

                case EINVAL:
                    throw this.exception_IncompleteMbSeq;

                default:
                    throw this.exception_Generic;
            }
        }
    }
}
297 
298 
299 
300 /*******************************************************************************
301 
302     String encoder sequence. Runs a sequence of encoders over a string until one
303     achieves a successful encoding.
304 
305     Params:
306         Encoders = tuple of types of encoders
307 
308 *******************************************************************************/
309 
public class StringEncoderSequence ( Encoders... )
{
    /***************************************************************************

        Static constructor. Its only content is a compile-time check that
        every type in the Encoders tuple is implicitly convertible to
        StringEncoder.

    ***************************************************************************/

    static this ( )
    {
        foreach ( Candidate; Encoders )
        {
            static assert(is(Candidate : StringEncoder));
        }
    }


    /***************************************************************************

        One encoder instance per template type, in template argument order.

    ***************************************************************************/

    private StringEncoder[] encoders;


    /***************************************************************************

        Constructor. Creates one instance of each of the template types.

    ***************************************************************************/

    public this ( )
    {
        this.encoders.length = Encoders.length;

        foreach ( idx, Candidate; Encoders )
        {
            this.encoders[idx] = new Candidate;
        }
    }

    /***************************************************************************

        Tries each encoder in turn and stops at the first one which converts
        the input without throwing.

        This method is aliased with opCall.

        Params:
            input = text to convert
            output = buffer receiving the converted text

        Returns:
            output, holding the converted text, or with zero length if every
            encoder failed.

    ***************************************************************************/

    public mstring convert ( cstring input, ref mstring output )
    {
        output.length = 0;
        enableStomping(output);

        foreach ( encoder; this.encoders )
        {
            bool converted = false;

            try
            {
                converted = this.convert(encoder, input, output);
            }
            catch ( IconvException )
            {
                // Any iconv failure (including the invalid / incomplete
                // multibyte sequence subclasses) just means: move on to
                // the next encoder.
            }

            if ( converted )
            {
                return output;
            }
        }

        // No encoder succeeded: discard any partial result
        output.length = 0;
        enableStomping(output);
        return output;
    }

    public alias convert opCall;


    /***************************************************************************

        Runs a single encoder over the given text.

        Only IconvException.InvalidMbSeq is handled here; any other exception
        thrown by the encoder propagates to the caller.

        Params:
            encoder = encoder to use
            input = text to convert
            output = buffer receiving the converted text

        Returns:
            true if the encoder finished without throwing, false if it threw
            IconvException.InvalidMbSeq

    ***************************************************************************/

    private bool convert ( StringEncoder encoder, cstring input, ref mstring output )
    {
        bool succeeded = true;

        try
        {
            encoder.convert(input, output);
        }
        catch ( IconvException.InvalidMbSeq )
        {
            succeeded = false;
        }

        return succeeded;
    }
}
427 
428 ///
unittest
{
    // Try the input as UTF-8 first, then fall back to ISO-8859-1; both
    // encoders produce UTF-8 output.
    alias StringEncode!("UTF-8", "UTF-8//TRANSLIT") FromUtf8;
    alias StringEncode!("ISO-8859-1", "UTF-8//TRANSLIT") FromLatin1;
    alias StringEncoderSequence!(FromUtf8, FromLatin1) ToUtf8Sequence;

    auto sequence = new ToUtf8Sequence;
    mstring converted;

    sequence.convert("Soon\u2122", converted);

    test(converted == "Soon™", converted);
}