1 /*******************************************************************************
2 
3     Fast Unicode transcoders. These are particularly sensitive to
4     minor changes on 32bit x86 devices, because the register set of
5     those devices is so small. Beware of subtle changes which might
6     extend the execution-period by as much as 200%. Because of this,
7     three of the six transcoders might read past the end of input by
8     one, two, or three bytes before arresting themselves. Note that
9     support for streaming adds a 15% overhead to the dchar => char
10     conversion, but has little effect on the others.
11 
12     These routines were tuned on an Intel P4; other devices may work
13     more efficiently with a slightly different approach, though this
14     is likely to be reasonably optimal on AMD x86 CPUs also. These
15     algorithms would benefit significantly from those extra AMD64
16     registers. On a 3GHz P4, the dchar/char conversions take around
17     2500ns to process an array of 1000 ASCII elements. Invoking the
18     memory manager doubles that period, and quadruples the time for
19     arrays of 100 elements. Memory allocation can slow down notably
20     in a multi-threaded environment, so avoid that where possible.
21 
22     Surrogate-pairs are dealt with in a non-optimal fashion when
23     transcoding between utf16 and utf8. Such cases are considered
24     to be boundary-conditions for this module.
25 
26     There are three common cases where the input may be incomplete,
27     including each 'widening' case of utf8 => utf16, utf8 => utf32,
28     and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
29     pairs are present. Such cases will throw an exception, unless
30     streaming-mode is enabled ~ in the latter mode, an additional
31     integer is returned indicating how many elements of the input
32     have been consumed. In all cases, a correct slice of the output
33     is returned.
34 
35     For details on Unicode processing see:
36     $(UL $(LINK http://www.utf-8.com/))
37     $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
38     $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
39     $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
40 
41     Copyright:
42         Copyright (c) 2004 Kris Bell.
43         Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
44         All rights reserved.
45 
46     License:
47         Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
48         See LICENSE_TANGO.txt for details.
49 
50     Version: Initial release: Oct 2004
51 
52     Authors: Kris
53 
54  *******************************************************************************/
55 
56 module ocean.text.convert.Utf;
57 
58 static import core.exception;
59 
60 import ocean.meta.types.Qualifiers;
61 
62 /*******************************************************************************
63 
64   Symmetric calls for equivalent types; these return the provided
65   input with no conversion
66 
67  *******************************************************************************/
68 
// UTF-8 in, UTF-8 out: nothing to transcode, so the input slice is handed
// straight back. Neither `dst` nor `ate` is touched.
const(char)[] toString (const(char)[] src, char[] dst=null, size_t* ate=null)
{
    return src;
}

// UTF-16 in, UTF-16 out: returns the input slice unchanged.
const(wchar)[] toString (const(wchar)[] src, wchar[] dst, size_t* ate=null)
{
    return src;
}

// UTF-32 in, UTF-32 out: returns the input slice unchanged.
const(dchar)[] toString (const(dchar)[] src, dchar[] dst, size_t* ate=null)
{
    return src;
}
72 
73 
74 /*******************************************************************************
75 
76   Encode a string of characters into an UTF-8 string, providing one character
77   at a time to the delegate.
78 
  This allows shifting the allocation strategy to the user, who might have
  more information about the kind of data passed to this function.
81 
82   Parameters:
83     input = UTF-8, UTF-16 or UTF-32 encoded string to encode to UTF-8
84     dg    = Output delegate to pass the result to
85 
86   Note:
87     Unlike the other `toString` variant, UTF-16 -> UTF-8 doesn't support
88     surrogate pairs and will call `onUnicodeError`.
89 
90 *******************************************************************************/
91 
public void toString (const(char)[] input, scope size_t delegate(cstring) dg)
{
    // The input already is UTF-8, so it is forwarded to the sink in a single
    // call. The value returned by the sink is not used.
    cstring utf8 = input;
    dg(utf8);
}
96 
97 /// Ditto
public void toString (const(wchar)[] input, scope size_t delegate(const(char)[]) dg)
{
    // Scratch space for one encoded code unit; a non-surrogate UTF-16 unit
    // needs at most three UTF-8 bytes.
    char[4] buff;

    foreach (size_t idx, wchar c; input)
    {
        if (c < 0x80)
        {
            // Copy the value rather than reinterpreting the address of the
            // 16-bit `c` as a char pointer: the first byte in memory only
            // holds the ASCII value on little-endian targets.
            buff[0] = cast(char) c;
            dg(buff[0 .. 1]);
        }
        else if (c < 0x0800)
        {
            // 110xxxxx 10xxxxxx
            buff[0] = cast(char)(0xc0 | ((c >> 6) & 0x1f));
            buff[1] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 2]);
        }
        else if (c < 0xd800 || c > 0xdfff)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            buff[0] = cast(char)(0xe0 | ((c >> 12) & 0x0f));
            buff[1] = cast(char)(0x80 | ((c >> 6)  & 0x3f));
            buff[2] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 3]);
        }
        else
            // Surrogates are not handled by this variant. Report the index of
            // the offending code unit (it used to be reported as 0).
            core.exception.onUnicodeError("Unicode.toString : Surrogate pair not supported", idx);
    }
}
122 
123 /// Ditto
public void toString (const(dchar)[] input, scope size_t delegate(const(char)[]) dg)
{
    // Scratch space for one encoded code point (at most four UTF-8 bytes).
    char[4] buff;

    foreach (size_t idx, dchar c; input)
    {
        if (c < 0x80)
        {
            // Copy the value rather than reinterpreting the address of the
            // 32-bit `c` as a char pointer: the first byte in memory only
            // holds the ASCII value on little-endian targets.
            buff[0] = cast(char) c;
            dg(buff[0 .. 1]);
        }
        else if (c < 0x0800)
        {
            // 110xxxxx 10xxxxxx
            buff[0] = cast(char)(0xc0 | ((c >> 6) & 0x1f));
            buff[1] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 2]);
        }
        else if (c < 0x10000)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            buff[0] = cast(char)(0xe0 | ((c >> 12) & 0x0f));
            buff[1] = cast(char)(0x80 | ((c >> 6)  & 0x3f));
            buff[2] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 3]);
        }
        else if (c < 0x110000)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buff[0] = cast(char)(0xf0 | ((c >> 18) & 0x07));
            buff[1] = cast(char)(0x80 | ((c >> 12) & 0x3f));
            buff[2] = cast(char)(0x80 | ((c >> 6)  & 0x3f));
            buff[3] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 4]);
        }
        else
            // beyond the last Unicode code point
            core.exception.onUnicodeError("Unicode.toString : invalid dchar", idx);
    }
}
156 
157 
158 /*******************************************************************************
159 
160     Encode Utf8 up to a maximum of 4 bytes long (five & six byte
161     variations are not supported).
162 
163     If the output is provided off the stack, it should be large
164     enough to encompass the entire transcoding; failing to do
165     so will cause the output to be moved onto the heap instead.
166 
167     Returns a slice of the output buffer, corresponding to the
168     converted characters. For optimum performance, the returned
169     buffer should be specified as 'output' on subsequent calls.
170     For example:
171 
172     ---
173     char[] output;
174 
175     char[] result = toString (input, output);
176 
177     // reset output after a realloc
178     if (result.length > output.length)
179     output = result;
180     ---
181 
182     Where 'ate' is provided, it will be set to the number of
183     elements consumed from the input, and the output buffer
184     will not be resized (or allocated). This represents a
185     streaming mode, where slices of the input are processed
186     in sequence rather than all at one time (should use 'ate'
187     as an index for slicing into unconsumed input).
188 
189  *******************************************************************************/
190 
char[] toString (const(wchar)[] input, char[] output=null, size_t* ate=null)
{
    // Surrogate pairs are transcoded in place. The previous fallback through
    // toString32() discarded everything when streaming: it returned an empty
    // slice with *ate == 0, so a stream containing a pair never advanced.
    //
    // Throws a Unicode error (core.exception.onUnicodeError) for an unpaired
    // surrogate, and for a leading surrogate at the very end of the input
    // when not streaming.

    if (ate)
        *ate = input.length;
    else
    {
        // potentially reallocate output
        auto estimate = input.length * 2 + 3;
        if (output.length < estimate)
            output.length = estimate;
    }

    size_t used;    // bytes written to output so far
    size_t eaten;   // code units consumed from input so far

    while (eaten < input.length)
    {
        uint   b     = input[eaten];
        size_t units = 1;   // input code units forming this code point
        size_t room  = 3;   // free output bytes required before encoding

        if (b >= 0xd800 && b <= 0xdfff)
        {
            // a trailing surrogate may not start a code point
            if (b >= 0xdc00)
                core.exception.onUnicodeError("Unicode.toString : invalid utf16 input", eaten);

            if (eaten + 1 >= input.length)
            {
                // leading surrogate at the very end: when streaming, leave
                // it unconsumed so it can be retried with more input
                if (ate)
                {
                    *ate = eaten;
                    break;
                }
                core.exception.onUnicodeError("Unicode.toString : incomplete utf16 input", eaten);
            }

            uint lo = input[eaten + 1];
            if (lo < 0xdc00 || lo > 0xdfff)
                core.exception.onUnicodeError("Unicode.toString : invalid utf16 input", eaten);

            b = 0x10000 + ((b - 0xd800) << 10) + (lo - 0xdc00);
            units = 2;
            room  = 4;
        }

        // about to overflow the output? Written as an addition so that a
        // short (or null) streaming buffer cannot underflow the comparison.
        if (output.length < used + room)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // grow by half of what is in use, but never by less than the
            // space this code point needs
            auto grown = used + used / 2;
            if (grown < used + room)
                grown = used + room;
            output.length = grown;
        }

        if (b < 0x80)
            output[used++] = cast(char) b;
        else if (b < 0x0800)
        {
            output[used]     = cast(char)(0xc0 | (b >> 6));
            output[used + 1] = cast(char)(0x80 | (b & 0x3f));
            used += 2;
        }
        else if (b < 0x10000)
        {
            output[used]     = cast(char)(0xe0 | (b >> 12));
            output[used + 1] = cast(char)(0x80 | ((b >> 6) & 0x3f));
            output[used + 2] = cast(char)(0x80 | (b & 0x3f));
            used += 3;
        }
        else
        {
            output[used]     = cast(char)(0xf0 | (b >> 18));
            output[used + 1] = cast(char)(0x80 | ((b >> 12) & 0x3f));
            output[used + 2] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
            output[used + 3] = cast(char)(0x80 | (b & 0x3f));
            used += 4;
        }

        eaten += units;
    }

    // return the produced output
    return output [0 .. used];
}
256 
257 /*******************************************************************************
258 
259   Decode Utf8 produced by the above toString() method.
260 
261   If the output is provided off the stack, it should be large
262   enough to encompass the entire transcoding; failing to do
263   so will cause the output to be moved onto the heap instead.
264 
265   Returns a slice of the output buffer, corresponding to the
266   converted characters. For optimum performance, the returned
267   buffer should be specified as 'output' on subsequent calls.
268 
269   Where 'ate' is provided, it will be set to the number of
270   elements consumed from the input, and the output buffer
271   will not be resized (or allocated). This represents a
272   streaming mode, where slices of the input are processed
273   in sequence rather than all at one time (should use 'ate'
274   as an index for slicing into unconsumed input).
275 
276  *******************************************************************************/
277 
wchar[] toString16 (const(char)[] input, wchar[] output=null, size_t* ate=null)
{
    // Four-byte sequences are turned into surrogate pairs in place. The
    // previous fallback through toString32() produced nothing when streaming
    // (empty result, *ate == 0). Continuation bytes are only read after
    // checking that they lie inside `input`.
    //
    // Throws a Unicode error (core.exception.onUnicodeError) for a truncated
    // sequence when not streaming, and for a four-byte sequence that decodes
    // beyond 0x10ffff. The reported index is the start of the sequence.

    size_t produced;    // wchars written to output
    size_t consumed;    // bytes of input fully decoded

    // one-shot mode: a sequence never yields more code units than it has bytes
    if (ate is null && input.length > output.length)
        output.length = input.length;

    while (consumed < input.length && produced < output.length)
    {
        uint   b    = input[consumed];
        size_t need = 1;    // bytes in this sequence, judged by its first byte

        if (b & 0x80)
            need = (b < 0xe0) ? 2 : (b < 0xf0) ? 3 : 4;

        if (need > input.length - consumed)
        {
            // truncated sequence: leave it for the next chunk or complain
            if (ate)
                break;
            core.exception.onUnicodeError("Unicode.toString16 : incomplete utf8 input", consumed);
        }

        if (need == 2)
            b = ((b & 0x1f) << 6) | (input[consumed + 1] & 0x3f);
        else if (need == 3)
            b = ((b & 0x0f) << 12)
              | ((input[consumed + 1] & 0x3f) << 6)
              |  (input[consumed + 2] & 0x3f);
        else if (need == 4)
        {
            b = ((b & 0x07) << 18)
              | ((input[consumed + 1] & 0x3f) << 12)
              | ((input[consumed + 2] & 0x3f) << 6)
              |  (input[consumed + 3] & 0x3f);

            if (b >= 0x110000)
                core.exception.onUnicodeError("Unicode.toString16 : invalid utf8 input", consumed);
        }

        if (b < 0x10000)
            output[produced++] = cast(wchar) b;
        else
        {
            // a pair needs two slots; this can only fail with a caller-sized
            // streaming buffer, in which case the sequence is not consumed
            if (output.length - produced < 2)
                break;

            output[produced]     = cast(wchar)(0xd800 | (((b - 0x10000) >> 10) & 0x3ff));
            output[produced + 1] = cast(wchar)(0xdc00 | ((b - 0x10000) & 0x3ff));
            produced += 2;
        }

        consumed += need;
    }

    // do we still have some input left?
    if (ate)
        *ate = consumed;
    else if (consumed < input.length)
        // this should never happen!
        core.exception.onUnicodeError("Unicode.toString16 : utf8 overflow", consumed);

    // return the produced output
    return output [0 .. produced];
}
354 
355 
356 /*******************************************************************************
357 
358   Encode Utf8 up to a maximum of 4 bytes long (five & six
359   byte variations are not supported). Throws an exception
360   where the input dchar is greater than 0x10ffff.
361 
362   If the output is provided off the stack, it should be large
363   enough to encompass the entire transcoding; failing to do
364   so will cause the output to be moved onto the heap instead.
365 
366   Returns a slice of the output buffer, corresponding to the
367   converted characters. For optimum performance, the returned
368   buffer should be specified as 'output' on subsequent calls.
369 
370   Where 'ate' is provided, it will be set to the number of
371   elements consumed from the input, and the output buffer
372   will not be resized (or allocated). This represents a
373   streaming mode, where slices of the input are processed
374   in sequence rather than all at one time (should use 'ate'
375   as an index for slicing into unconsumed input).
376 
377  *******************************************************************************/
378 
char[] toString (const(dchar)[] input, char[] output=null, size_t* ate=null)
{
    // Throws a Unicode error (core.exception.onUnicodeError) where an input
    // dchar is greater than 0x10ffff.

    if (ate)
        *ate = input.length;
    else
    {
        // potentially reallocate output
        auto estimate = input.length * 2 + 4;
        if (output.length < estimate)
            output.length = estimate;
    }

    size_t used;    // bytes written to output so far

    foreach (size_t eaten, dchar b; input)
    {
        // about to overflow the output? Four bytes are kept in hand because
        // that is the longest encoding written below. Written as an addition
        // so that a short (or null) streaming buffer cannot underflow.
        if (output.length < used + 4)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // Grow by half of what is in use, but never by less than four
            // bytes. Growing to `used + used / 2` alone is not enough while
            // `used` is small: three code points encoding to 4 + 3 + 4 bytes
            // overran the initial 10-byte estimate by one byte.
            auto grown = used + used / 2;
            if (grown < used + 4)
                grown = used + 4;
            output.length = grown;
        }

        if (b < 0x80)
            output[used++] = cast(char) b;
        else if (b < 0x0800)
        {
            output[used]     = cast(char)(0xc0 | ((b >> 6) & 0x1f));
            output[used + 1] = cast(char)(0x80 | (b & 0x3f));
            used += 2;
        }
        else if (b < 0x10000)
        {
            output[used]     = cast(char)(0xe0 | ((b >> 12) & 0x0f));
            output[used + 1] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
            output[used + 2] = cast(char)(0x80 | (b & 0x3f));
            used += 3;
        }
        else if (b < 0x110000)
        {
            output[used]     = cast(char)(0xf0 | ((b >> 18) & 0x07));
            output[used + 1] = cast(char)(0x80 | ((b >> 12) & 0x3f));
            output[used + 2] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
            output[used + 3] = cast(char)(0x80 | (b & 0x3f));
            used += 4;
        }
        else
            core.exception.onUnicodeError("Unicode.toString : invalid dchar", eaten);
    }

    // return the produced output
    return output [0 .. used];
}
446 
447 
448 /*******************************************************************************
449 
450   Decode Utf8 produced by the above toString() method.
451 
452   If the output is provided off the stack, it should be large
453   enough to encompass the entire transcoding; failing to do
454   so will cause the output to be moved onto the heap instead.
455 
456   Returns a slice of the output buffer, corresponding to the
457   converted characters. For optimum performance, the returned
458   buffer should be specified as 'output' on subsequent calls.
459 
460   Where 'ate' is provided, it will be set to the number of
461   elements consumed from the input, and the output buffer
462   will not be resized (or allocated). This represents a
463   streaming mode, where slices of the input are processed
464   in sequence rather than all at one time (should use 'ate'
465   as an index for slicing into unconsumed input).
466 
467  *******************************************************************************/
468 
dchar[] toString32 (const(char)[] input, dchar[] output=null, size_t* ate=null)
{
    // Continuation bytes are only read after checking that they lie inside
    // `input`. Previously up to three bytes behind the slice were read, and a
    // truncated four-byte sequence could be reported as "invalid" because of
    // whatever happened to follow it in memory.
    //
    // Throws a Unicode error (core.exception.onUnicodeError) for a truncated
    // sequence when not streaming, and for a four-byte sequence that decodes
    // beyond 0x10ffff. The reported index is the start of the sequence.

    size_t produced;    // dchars written to output
    size_t consumed;    // bytes of input fully decoded

    // one-shot mode: one dchar per input byte is always enough
    if (ate is null && input.length > output.length)
        output.length = input.length;

    while (consumed < input.length && produced < output.length)
    {
        uint   b    = input[consumed];
        size_t need = 1;    // bytes in this sequence, judged by its first byte

        if (b & 0x80)
            need = (b < 0xe0) ? 2 : (b < 0xf0) ? 3 : 4;

        if (need > input.length - consumed)
        {
            // truncated sequence: leave it for the next chunk or complain
            if (ate)
                break;
            core.exception.onUnicodeError("Unicode.toString32 : incomplete utf8 input", consumed);
        }

        if (need == 2)
            b = ((b & 0x1f) << 6) | (input[consumed + 1] & 0x3f);
        else if (need == 3)
            b = ((b & 0x0f) << 12)
              | ((input[consumed + 1] & 0x3f) << 6)
              |  (input[consumed + 2] & 0x3f);
        else if (need == 4)
        {
            b = ((b & 0x07) << 18)
              | ((input[consumed + 1] & 0x3f) << 12)
              | ((input[consumed + 2] & 0x3f) << 6)
              |  (input[consumed + 3] & 0x3f);

            if (b >= 0x110000)
                core.exception.onUnicodeError("Unicode.toString32 : invalid utf8 input", consumed);
        }

        output[produced++] = cast(dchar) b;
        consumed += need;
    }

    // do we still have some input left?
    if (ate)
        *ate = consumed;
    else if (consumed < input.length)
        // this should never happen!
        core.exception.onUnicodeError("Unicode.toString32 : utf8 overflow", consumed);

    // return the produced output
    return output [0 .. produced];
}
553 
554 /*******************************************************************************
555 
  Encode Utf16 up to a maximum of 2 code units (wchars) long. Throws an
  exception where the input dchar is greater than 0x10ffff.
558 
559   If the output is provided off the stack, it should be large
560   enough to encompass the entire transcoding; failing to do
561   so will cause the output to be moved onto the heap instead.
562 
563   Returns a slice of the output buffer, corresponding to the
564   converted characters. For optimum performance, the returned
565   buffer should be specified as 'output' on subsequent calls.
566 
567   Where 'ate' is provided, it will be set to the number of
568   elements consumed from the input, and the output buffer
569   will not be resized (or allocated). This represents a
570   streaming mode, where slices of the input are processed
571   in sequence rather than all at one time (should use 'ate'
572   as an index for slicing into unconsumed input).
573 
574  *******************************************************************************/
575 
wchar[] toString16 (const(dchar)[] input, wchar[] output=null, size_t* ate=null)
{
    // Streaming callers are told how much input was used; assume "all of it"
    // until the output runs short. One-shot callers get a buffer large enough
    // for two code units per code point, plus the two-unit margin used below.
    if (ate !is null)
        *ate = input.length;
    else if (output.length < input.length * 2 + 2)
        output.length = input.length * 2 + 2;

    auto cursor = output.ptr;
    auto limit  = output.ptr + output.length - 2;

    for (size_t index = 0; index < input.length; ++index)
    {
        // fewer than two free code units left?
        if (cursor > limit)
        {
            if (ate !is null)
            {
                // streaming: report where we stopped and hand back the rest
                *ate = index;
                break;
            }

            // one-shot: enlarge the buffer by half of what is in use
            size_t filled = cursor - output.ptr;
            output.length = filled + filled / 2;
            cursor = output.ptr + filled;
            limit  = output.ptr + output.length - 2;
        }

        dchar code = input[index];

        if (code >= 0x110000)
            core.exception.onUnicodeError("Unicode.toString16 : invalid dchar", index);
        else if (code >= 0x10000)
        {
            // supplementary plane: split into leading and trailing surrogate
            cursor[0] = cast(wchar)(0xd800 | (((code - 0x10000) >> 10) & 0x3ff));
            cursor[1] = cast(wchar)(0xdc00 | ((code - 0x10000) & 0x3ff));
            cursor += 2;
        }
        else
            *cursor++ = cast(wchar) code;
    }

    // the part of the buffer that was actually written
    return output [0 .. (cursor - output.ptr)];
}
625 
626 /*******************************************************************************
627 
628   Decode Utf16 produced by the above toString16() method.
629 
630   If the output is provided off the stack, it should be large
631   enough to encompass the entire transcoding; failing to do
632   so will cause the output to be moved onto the heap instead.
633 
634   Returns a slice of the output buffer, corresponding to the
635   converted characters. For optimum performance, the returned
636   buffer should be specified as 'output' on subsequent calls.
637 
638   Where 'ate' is provided, it will be set to the number of
639   elements consumed from the input, and the output buffer
640   will not be resized (or allocated). This represents a
641   streaming mode, where slices of the input are processed
642   in sequence rather than all at one time (should use 'ate'
643   as an index for slicing into unconsumed input).
644 
645  *******************************************************************************/
646 
dchar[] toString32 (const(wchar)[] input, dchar[] output=null, size_t* ate=null)
{
    // Both halves of a surrogate pair are validated, and the second half is
    // only read when it lies inside `input`. Previously a leading surrogate
    // followed by an arbitrary code unit decoded to an unrelated code point,
    // and a leading surrogate at the end read one unit behind the slice.
    //
    // Throws a Unicode error (core.exception.onUnicodeError) for an unpaired
    // surrogate, and for a leading surrogate at the very end of the input
    // when not streaming. The reported index is that of the surrogate.

    size_t produced;    // dchars written to output
    size_t consumed;    // code units of input fully decoded

    // one-shot mode: one dchar per input code unit is always enough
    if (ate is null && input.length > output.length)
        output.length = input.length;

    while (consumed < input.length && produced < output.length)
    {
        uint   b     = input[consumed];
        size_t units = 1;

        if (b >= 0xd800 && b <= 0xdfff)
        {
            // a trailing surrogate may not start a code point
            if (b >= 0xdc00)
                core.exception.onUnicodeError("Unicode.toString32 : invalid utf16 input", consumed);

            if (consumed + 1 >= input.length)
            {
                // pair cut in half: leave it for the next chunk or complain
                if (ate)
                    break;
                core.exception.onUnicodeError("Unicode.toString32 : incomplete utf16 input", consumed);
            }

            uint lo = input[consumed + 1];
            if (lo < 0xdc00 || lo > 0xdfff)
                core.exception.onUnicodeError("Unicode.toString32 : invalid utf16 input", consumed);

            // see http://www.unicode.org/faq/utf_bom.html#35
            b = 0x10000 + ((b - 0xd800) << 10) + (lo - 0xdc00);
            units = 2;
        }

        output[produced++] = cast(dchar) b;
        consumed += units;
    }

    // do we still have some input left?
    if (ate)
        *ate = consumed;
    else if (consumed < input.length)
        // this should never happen!
        core.exception.onUnicodeError("Unicode.toString32 : utf16 overflow", consumed);

    // return the produced output
    return output [0 .. produced];
}
707 
708 
709 /*******************************************************************************
710 
711   Decodes a single dchar from the given src text, and indicates how
712   many chars were consumed from src to do so.
713 
714  *******************************************************************************/
715 
dchar decode (cstring src, ref size_t ate)
{
    // A one-element buffer in streaming mode makes toString32() stop after
    // the first code point and report the bytes it used through `ate`.
    dchar[1] ret;
    auto decoded = toString32 (src, ret, &ate);

    // Nothing is produced when `src` is empty or ends inside its first
    // sequence; indexing the empty result would be out of bounds.
    if (decoded.length is 0)
        core.exception.onUnicodeError("Unicode.decode : incomplete utf8 input", 0);

    return decoded[0];
}
721 
722 /*******************************************************************************
723 
724   Decodes a single dchar from the given src text, and indicates how
725   many wchars were consumed from src to do so.
726 
727  *******************************************************************************/
728 
dchar decode (const(wchar)[] src, ref size_t ate)
{
    // A one-element buffer in streaming mode makes toString32() stop after
    // the first code point and report the wchars it used through `ate`.
    dchar[1] ret;
    auto decoded = toString32 (src, ret, &ate);

    // Nothing is produced when `src` is empty or holds only the first half
    // of a surrogate pair; indexing the empty result would be out of bounds.
    if (decoded.length is 0)
        core.exception.onUnicodeError("Unicode.decode : incomplete utf16 input", 0);

    return decoded[0];
}
734 
735 /*******************************************************************************
736 
737   Encode a dchar into the provided dst array, and return a slice of
738   it representing the encoding
739 
740  *******************************************************************************/
741 
mstring encode (mstring dst, dchar c)
{
    // View the single code point as a one-element array so that the bulk
    // UTF-32 => UTF-8 transcoder can do the work.
    const(dchar)[] single = (&c)[0 .. 1];
    return toString (single, dst);
}
746 
747 /*******************************************************************************
748 
749   Encode a dchar into the provided dst array, and return a slice of
750   it representing the encoding
751 
752  *******************************************************************************/
753 
wchar[] encode (wchar[] dst, dchar c)
{
    // View the single code point as a one-element array so that the bulk
    // UTF-32 => UTF-16 transcoder can do the work.
    const(dchar)[] single = (&c)[0 .. 1];
    return toString16 (single, dst);
}
758 
759 /*******************************************************************************
760 
761   Is the given character valid?
762 
763  *******************************************************************************/
764 
bool isValid (dchar c)
{
    // The surrogate range 0xD800 .. 0xDFFF is never valid.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Everything else is valid up to and including 0x10FFFF.
    return c <= 0x10FFFF;
}
769 
770 /*******************************************************************************
771 
772   Convert from a char[] into the type of the dst provided.
773 
774   Returns a slice of the given dst, where it is sufficiently large
775   to house the result, or a heap-allocated array otherwise. Returns
776   the original input where no conversion is required.
777 
778  *******************************************************************************/
779 
const(T)[] fromString8 (T) (cstring s, T[] dst)
{
    // Dispatch on the requested element type at compile time.
    static if (is(T == dchar))
        // UTF-8 => UTF-32
        return .toString32 (s, dst);
    else static if (is(T == wchar))
        // UTF-8 => UTF-16
        return .toString16 (s, dst);
    else static if (is(T == char))
        // already UTF-8: hand the input back untouched
        return s;
    else
        static assert (false);
}
791 
792 /*******************************************************************************
793 
794   Convert from a wchar[] into the type of the dst provided.
795 
796   Returns a slice of the given dst, where it is sufficiently large
797   to house the result, or a heap-allocated array otherwise. Returns
798   the original input where no conversion is required.
799 
800  *******************************************************************************/
801 
// UTF-16 => UTF-8, delegating to the module-level transcoder.
const(char)[] fromString16 (const(wchar)[] s, char[] dst)
{
    auto utf8 = .toString (s, dst);
    return utf8;
}

// UTF-16 => UTF-16: no conversion required, `dst` is not used.
const(wchar)[] fromString16 (const(wchar)[] s, wchar[] dst)
{
    const(wchar)[] same = s;
    return same;
}

// UTF-16 => UTF-32, delegating to the module-level transcoder.
const(dchar)[] fromString16 (const(wchar)[] s, dchar[] dst)
{
    auto utf32 = .toString32 (s, dst);
    return utf32;
}
817 
818 /*******************************************************************************
819 
820   Convert from a dchar[] into the type of the dst provided.
821 
822   Returns a slice of the given dst, where it is sufficiently large
823   to house the result, or a heap-allocated array otherwise. Returns
824   the original input where no conversion is required.
825 
826  *******************************************************************************/
827 
// UTF-32 => UTF-8, delegating to the module-level transcoder.
const(char)[] fromString32 (const(dchar)[] s, char[] dst)
{
    auto utf8 = .toString (s, dst);
    return utf8;
}

// UTF-32 => UTF-16, delegating to the module-level transcoder.
const(wchar)[] fromString32 (const(dchar)[] s, wchar[] dst)
{
    auto utf16 = .toString16 (s, dst);
    return utf16;
}

// UTF-32 => UTF-32: no conversion required, `dst` is not used.
const(dchar)[] fromString32 (const(dchar)[] s, dchar[] dst)
{
    const(dchar)[] same = s;
    return same;
}
842 
843 /*******************************************************************************
844 
845   Adjust the content such that no partial encodings exist on the
846   left side of the provided text.
847 
848   Returns a slice of the input
849 
850  *******************************************************************************/
851 
T[] cropLeft(T) (T[] s)
{
    // The element type is compared with its qualifiers normalised:
    // `is (T == char)` is false for the elements of `string` and
    // `const(char)[]`, which made this function a no-op for such arguments.
    static if (is (immutable(T) == immutable(char)))
    {
        // Continuation bytes (10xxxxxx) at the front belong to a sequence
        // whose lead byte was cut off. Skip all of them, whether they are
        // followed by a lead byte, an ASCII byte, or nothing at all.
        size_t i = 0;
        while (i < s.length && (s[i] & 0xc0) == 0x80)
            ++i;
        return s [i .. $];
    }
    else static if (is (immutable(T) == immutable(wchar)))
    {
        // skip if first wchar is a trailing surrogate (empty input is fine)
        if (s.length && (s[0] & 0xfc00) == 0xdc00)
            return s [1 .. $];
        return s;
    }
    else
        // UTF-32 has no partial encodings
        return s;
}
866 
867 /*******************************************************************************
868 
869   Adjust the content such that no partial encodings exist on the
870   right side of the provided text.
871 
872   Returns a slice of the input
873 
874  *******************************************************************************/
875 
T[] cropRight(T) (T[] s)
{
    // The element type is compared with its qualifiers normalised:
    // `is (T == char)` is false for the elements of `string` and
    // `const(char)[]`, which made this function a no-op for such arguments.
    static if (is (immutable(T) == immutable(char)))
    {
        // Walk back over trailing continuation bytes to the nearest lead
        // byte, including one sitting at index 0.
        for (size_t i = s.length; i > 0; --i)
        {
            uint b = s[i - 1];

            // plain ASCII: the text ends on a character boundary
            if ((b & 0x80) == 0)
                break;

            if ((b & 0xc0) == 0xc0)
            {
                // located the first byte of a sequence
                size_t present = s.length - (i - 1);

                // 110xxxxx => 2 bytes, 1110xxxx => 3 bytes, otherwise 4.
                // Bit 0x10 is only looked at once bit 0x20 is known to be
                // set; it is ordinary payload in two-byte leads 0xd0 .. 0xdf.
                size_t expected = (b & 0x20) == 0 ? 2
                                : (b & 0x10) == 0 ? 3
                                : 4;

                // keep the sequence only if it is complete
                return (present == expected) ? s : s [0 .. i - 1];
            }
        }
        return s;
    }
    else static if (is (immutable(T) == immutable(wchar)))
    {
        // skip if last wchar is a leading surrogate
        if (s.length && (s[$ - 1] & 0xfc00) == 0xd800)
            return s [0 .. $ - 1];
        return s;
    }
    else
        // UTF-32 has no partial encodings
        return s;
}
918 
919 
920 
921 /*******************************************************************************
922 
923  *******************************************************************************/
924 
debug (Utf)
{
    import ocean.io.Console;

    // Manual smoke test: prints a sample string, then the result of cropping
    // it after removing 0 .. 5 bytes from the front (cropLeft) and from the
    // back (cropRight).
    void main()
    {
        auto s = "[\xc2\xa2\xc2\xa2\xc2\xa2]";
        Cout (s).newline;

        foreach (skip; 0 .. 6)
            Cout (cropLeft(s[skip .. $])).newline;

        foreach (drop; 0 .. 6)
            Cout (cropRight(s[0 .. $ - drop])).newline;
    }
}