1 /*******************************************************************************
2 
3     Character encoding conversion.
4 
5     Character encoding conversion using the C iconv library
6     (ocean.text.util.c.Iconv).
7 
8     Usage:
9         This module can be used by creating an instance of the StringEncode
10         class with the template parameters of the desired character encoding
11         conversion:
12 
13         ---
14 
15             auto string_enc = new StringEncode!("ISO-8859-1", "UTF-8");
16 
17         ---
18 
19         The conversion function is called as follows:
20 
21         ---
22 
            cstring input = "A string to be converted";
            mstring output; // The buffer which is written into
25 
26             string_enc.convert(input, output);
27 
28         ---
29 
30     Copyright:
31         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
32         All rights reserved.
33 
34     License:
35         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
36         Alternatively, this file may be distributed under the terms of the Tango
37         3-Clause BSD License (see LICENSE_BSD.txt for details).
38 
39 *******************************************************************************/
40 
41 module ocean.text.util.StringEncode;
42 
43 
44 
45 
46 import ocean.meta.types.Qualifiers;
47 
48 version (unittest) import ocean.core.Test;
49 
50 import core.sys.posix.iconv;
51 import core.stdc.errno;
52 
/******************************************************************************

    Base class of all exceptions thrown by the iconv based encoders in this
    module.

    The more specific exception classes declared at module level are also
    reachable through this class (IconvException.InvalidMbSeq and
    IconvException.IncompleteMbSeq).

*******************************************************************************/

class IconvException : Exception
{
    /// Message used when the constructor is called without one.
    static immutable MSG = "Iconv: Error";

    /***************************************************************************

        Constructor.

        Params:
            msg  = exception message, defaults to MSG
            file = name of the source file reported by the exception
            line = source line reported by the exception

    ***************************************************************************/

    this ( istring msg = MSG, istring file = __FILE__, int line = __LINE__ )
    {
        super(msg, file, line);
    }

    // Expose the module level subclasses under this class' name space.
    alias InvalidMbSeq = .InvalidMbSeq;
    alias IncompleteMbSeq = .IncompleteMbSeq;
}
71 
/**************************************************************************

    Exception signalling an invalid multibyte sequence in the input.

 **************************************************************************/

class InvalidMbSeq :  IconvException
{
    /// Fixed message carried by every instance of this exception.
    static immutable msg = "Iconv: Invalid Multibyte Sequence";

    /***********************************************************************

        Constructor.

        Params:
            file = name of the source file reported by the exception
            line = source line reported by the exception

    ***********************************************************************/

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        // typeof(this).msg is the static message declared above, not the
        // instance field inherited from Throwable.
        super(typeof(this).msg, file, line);
    }
}
87 
/**************************************************************************

    Exception signalling that the input ends with an incomplete multibyte
    sequence.

 **************************************************************************/

class IncompleteMbSeq :  IconvException
{
    /// Fixed message carried by every instance of this exception.
    static immutable msg = "Iconv: Incomplete Multibyte Sequence";

    /***********************************************************************

        Constructor.

        Params:
            file = name of the source file reported by the exception
            line = source line reported by the exception

    ***********************************************************************/

    this ( istring file = __FILE__, int line = __LINE__ )
    {
        // typeof(this).msg is the static message declared above, not the
        // instance field inherited from Throwable.
        super(typeof(this).msg, file, line);
    }
}
103 
/*******************************************************************************

    Interface implemented by every string encoder of this module. It allows
    encoders for different character encodings to be handled uniformly, for
    example when they are stored together in an array.

*******************************************************************************/

interface StringEncoder
{
    /***************************************************************************

        Converts a string from one character encoding to another.

        Params:
            input = string to convert
            output = destination buffer, receives the converted string

    ***************************************************************************/

    public void convert ( cstring input, ref mstring output );
}
124 
125 
126 
/*******************************************************************************

    StringEncode class
    The template parameters are the character encoding types for the input
    and output of the converter.

    Params:
        fromcode = name of the encoding of the input, as understood by iconv
        tocode = name of the encoding of the output, as understood by iconv

*******************************************************************************/

public class StringEncode ( istring fromcode, istring tocode ) : StringEncoder
{
    /***************************************************************************

        The conversion descriptor which iconv uses internally

    ***************************************************************************/

    private iconv_t cd;


    /***************************************************************************

        Exceptions which could be thrown by this class. (These are created as
        class members so that there is no risk of convert() being called over
        and over, and newing exceptions each time, leading to an accumulation of
        memory over time.)

    ***************************************************************************/

    private IconvException.InvalidMbSeq exception_InvalidMbSeq;

    private IconvException.IncompleteMbSeq exception_IncompleteMbSeq;

    private IconvException exception_Generic;


    /***************************************************************************

        Constructor.
        Initialises iconv with the desired character encoding conversion types
        and creates the reusable exception instances.

        If iconv_open() fails (for example because the requested conversion
        is not supported), construction still succeeds; every subsequent call
        to convert() then throws the generic IconvException.

    ***************************************************************************/

    public this ( )
    {
        this.cd = iconv_open(tocode.ptr, fromcode.ptr);

        this.exception_InvalidMbSeq = new IconvException.InvalidMbSeq;

        this.exception_IncompleteMbSeq = new IconvException.IncompleteMbSeq;

        this.exception_Generic = new IconvException;
    }


    /***************************************************************************

        Destructor.
        Closes the iconv conversion descriptor, if one was opened.

    ***************************************************************************/

    private ~this ( )
    {
        if ( this.isOpen() )
        {
            iconv_close(this.cd);
        }
    }

    /***************************************************************************

        Returns:
            true if iconv_open() succeeded in the constructor, false if it
            returned the error value (iconv_t) -1

    ***************************************************************************/

    private bool isOpen ( )
    {
        return this.cd != cast(iconv_t) -1;
    }

    /***************************************************************************

        Converts a string in one encoding type to another (as specified by the
        class' template parameters).

        Makes a guess at the required size of output buffer, simply setting it
        to the same size as the input buffer. Then repeatedly tries converting
        the input and increasing the size of the output buffer until the
        conversion succeeds.

        To avoid repeated memory allocation, if you need to call this function
        many times, it's best to always pass the same output buffer.

        Params:
            input = the array of characters to be converted.
            output = array of characters which will be filled with the results
                     of the conversion. The output array is resized to fit the
                     results. If an exception is thrown, it holds whatever was
                     converted before the error occurred.

        Throws:
            IconvException.InvalidMbSeq if iconv reports EILSEQ,
            IconvException.IncompleteMbSeq if iconv reports EINVAL,
            IconvException on any other iconv error, including the case that
            the conversion descriptor could not be opened. The thrown objects
            are the pre-allocated instances owned by this class.

    ***************************************************************************/

    public override void convert ( cstring input, ref mstring output )
    {
        // Passing an invalid descriptor to iconv() is not guaranteed to be
        // handled gracefully by every iconv implementation, so fail here.
        if ( !this.isOpen() )
        {
            output.length = 0;
            assumeSafeAppend(output);

            throw this.exception_Generic;
        }

        assumeSafeAppend(output);
        output.length = input.length;
        assumeSafeAppend(output);

        size_t inbytesleft  = input.length;
        size_t outbytesleft = output.length;
        const(char)* inptr  = input.ptr;
        char* outptr = output.ptr;

        // Amount by which the output buffer is extended on E2BIG. It must
        // never be zero, otherwise an E2BIG reported for an empty input would
        // make the loop below spin forever.
        immutable size_t grow_by = input.length ? input.length : 16;

        ptrdiff_t result;

        // errno as set by the last iconv() call, 0 if that call succeeded.
        // Captured immediately because the array operations below call into
        // the runtime, which may modify errno.
        int error_code;

        // Do the conversion. Keep trying until there is no E2BIG error.
        while ( true )
        {
            // Attempt the conversion
            // FIXME: remove cast with https://github.com/dlang/druntime/pull/1742
            result = iconv(this.cd, cast(char**) &inptr, &inbytesleft, &outptr, &outbytesleft);

            error_code = (result < 0) ? errno() : 0;

            // If it wasn't E2BIG, we're finished
            if ( error_code != E2BIG )
            {
                break;
            }

            // Conversion failed because the output buffer was too small.
            // Resize the output buffer and try again.
            // To improve performance, we pass the number of bytes already
            // processed to iconv. But, because extending the buffer may
            // result in a memory allocation, outptr may become invalid.

            // Convert 'outptr' to an index
            size_t out_so_far = outptr - output.ptr;

            output.length = output.length + grow_by;
            outbytesleft += grow_by;

            // Readjust outptr to the same position relative to output.ptr,
            // in case memory allocation just occurred
            outptr = output.ptr + out_so_far;
        }

        // Cut off the part of the buffer iconv did not write to.
        output.length = output.length - outbytesleft;
        assumeSafeAppend(output);

        // Check for any errors from iconv and throw them as exceptions
        if ( result < 0 )
        {
            // Put the descriptor back into its initial state so that the
            // failed conversion cannot influence the next call to convert().
            iconv(this.cd, null, null, null, null);

            switch ( error_code )
            {
                case EILSEQ:
                    throw this.exception_InvalidMbSeq;

                case EINVAL:
                    throw this.exception_IncompleteMbSeq;

                default:
                    throw this.exception_Generic;
            }
        }
    }
}
281 
282 
283 
/*******************************************************************************

    Sequence of string encoders. The encoders are tried on a string one after
    another, in the order of the template arguments, until one of them converts
    the string without error.

    Params:
        Encoders = tuple of encoder types, each must implement StringEncoder

*******************************************************************************/

public class StringEncoderSequence ( Encoders... )
{
    /***************************************************************************

        Static constructor - compile time check that every type in Encoders
        implements the StringEncoder interface.

    ***************************************************************************/

    static this ( )
    {
        foreach ( Enc; Encoders )
        {
            static assert(is(Enc : StringEncoder));
        }
    }


    /***************************************************************************

        The encoder instances, in the order of the template arguments.

    ***************************************************************************/

    private StringEncoder[] encoders;


    /***************************************************************************

        Constructor. Creates one instance of each type in Encoders.

    ***************************************************************************/

    public this ( )
    {
        foreach ( Enc; Encoders )
        {
            this.encoders ~= new Enc;
        }
    }

    /***************************************************************************

        Tries the encoders one after another and stops at the first one which
        converts the input without throwing an IconvException.

        This method is aliased with opCall.

        Params:
            input = text to convert
            output = buffer receiving the converted text

        Returns:
            output, holding the converted text, or with a length of 0 if all
            encoders failed.

    ***************************************************************************/

    public mstring convert ( cstring input, ref mstring output )
    {
        output.length = 0;
        assumeSafeAppend(output);

        foreach ( encoder; this.encoders )
        {
            bool succeeded = false;

            try
            {
                succeeded = this.convert(encoder, input, output);
            }
            catch ( IconvException )
            {
                // Any iconv exception (IconvException is the base class of
                // the more specific ones) just means: try the next encoder.
            }

            if ( succeeded )
            {
                return output;
            }
        }

        // None of the encoders succeeded: discard partial results.
        output.length = 0;
        assumeSafeAppend(output);
        return output;
    }

    public alias opCall = convert;


    /***************************************************************************

        Runs a single encoder over the given text.

        Params:
            encoder = encoder to use
            input = text to convert
            output = buffer receiving the converted text

        Returns:
            true if the encoder finished without error, false if it reported
            an invalid multibyte sequence. Other exceptions thrown by the
            encoder are passed on to the caller.

    ***************************************************************************/

    private bool convert ( StringEncoder encoder, cstring input, ref mstring output )
    {
        try
        {
            encoder.convert(input, output);
        }
        catch ( IconvException.InvalidMbSeq )
        {
            return false;
        }

        return true;
    }
}
411 
///
unittest
{
    // Try UTF-8 first, fall back to ISO-8859-1 if the input is not valid UTF-8.
    alias Utf8Converter = StringEncode!("UTF-8", "UTF-8//TRANSLIT");
    alias Iso_8859_1_Converter = StringEncode!("ISO-8859-1", "UTF-8//TRANSLIT");
    alias Utf8Encoder = StringEncoderSequence!(Utf8Converter, Iso_8859_1_Converter);

    auto sequence = new Utf8Encoder;
    mstring converted;

    sequence.convert("Soon\u2122", converted);
    test(converted == "Soon™", converted);
}