1 /*******************************************************************************
2 3 Fast Unicode transcoders. These are particularly sensitive to
4 minor changes on 32bit x86 devices, because the register set of
5 those devices is so small. Beware of subtle changes which might
6 extend the execution-period by as much as 200%. Because of this,
7 three of the six transcoders might read past the end of input by
8 one, two, or three bytes before arresting themselves. Note that
9 support for streaming adds a 15% overhead to the dchar => char
10 conversion, but has little effect on the others.
11 12 These routines were tuned on an Intel P4; other devices may work
13 more efficiently with a slightly different approach, though this
14 is likely to be reasonably optimal on AMD x86 CPUs also. These
15 algorithms would benefit significantly from those extra AMD64
16 registers. On a 3GHz P4, the dchar/char conversions take around
17 2500ns to process an array of 1000 ASCII elements. Invoking the
18 memory manager doubles that period, and quadruples the time for
19 arrays of 100 elements. Memory allocation can slow down notably
20 in a multi-threaded environment, so avoid that where possible.
21 22 Surrogate-pairs are dealt with in a non-optimal fashion when
23 transcoding between utf16 and utf8. Such cases are considered
24 to be boundary-conditions for this module.
25 26 There are three common cases where the input may be incomplete,
27 including each 'widening' case of utf8 => utf16, utf8 => utf32,
28 and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
29 pairs are present. Such cases will throw an exception, unless
30 streaming-mode is enabled ~ in the latter mode, an additional
31 integer is returned indicating how many elements of the input
32 have been consumed. In all cases, a correct slice of the output
33 is returned.
34 35 For details on Unicode processing see:
36 $(UL $(LINK http://www.utf-8.com/))
37 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
38 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
39 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
40 41 Copyright:
42 Copyright (c) 2004 Kris Bell.
43 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
44 All rights reserved.
45 46 License:
47 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
48 See LICENSE_TANGO.txt for details.
49 50 Version: Initial release: Oct 2004
51 52 Authors: Kris
53 54 *******************************************************************************/55 56 moduleocean.text.convert.Utf;
57 58 staticimportcore.exception;
59 60 importocean.meta.types.Qualifiers;
/*******************************************************************************

    Identity "conversions": the source and destination encodings are the
    same, so the input slice is handed straight back without copying.

    `dst` and `ate` exist only so that these overloads can be called with
    the same argument list as the real transcoders; neither is read nor
    written here.

*******************************************************************************/

const(char)[] toString (const(char)[] src, char[] dst=null, size_t* ate=null)
{
    return src;
}

const(wchar)[] toString (const(wchar)[] src, wchar[] dst, size_t* ate=null)
{
    return src;
}

const(dchar)[] toString (const(dchar)[] src, dchar[] dst, size_t* ate=null)
{
    return src;
}
/*******************************************************************************

    Encode a string of characters into UTF-8, handing the result to a
    delegate instead of writing it into a buffer.

    This shifts the allocation strategy onto the caller, who may know more
    about the kind of data being converted than this module does.

    Parameters:
        input = UTF-8, UTF-16 or UTF-32 encoded string to encode to UTF-8
        dg    = Output delegate that receives the encoded bytes

    Note:
        Unlike the other `toString` variant, UTF-16 -> UTF-8 doesn't support
        surrogate pairs and will call `onUnicodeError`.

*******************************************************************************/

public void toString (const(char)[] input, scope size_t delegate(cstring) dg)
{
    // The input is already UTF-8: forward it unchanged in a single call.
    // The value returned by the delegate is not used.
    dg(input);
}
// UTF-16 overload. Each code unit is encoded on its own and passed to `dg`
// as a 1, 2 or 3 byte slice of a local scratch buffer; the slice is only
// valid for the duration of that delegate call. A code unit in the
// surrogate range (0xD800 - 0xDFFF) is reported through `onUnicodeError`
// together with its index in `input`.
/// Ditto
public void toString (const(wchar)[] input, scope size_t delegate(cstring) dg)
{
    char[4] buff;
    foreach (size_t idx, wchar c; input)
    {
        if (c < 0x80)
        {
            // Copy the value instead of reinterpreting the wchar's memory:
            // the first byte in memory only holds the ASCII value on
            // little-endian targets.
            buff[0] = cast(char) c;
            dg(buff[0 .. 1]);
        }
        else if (c < 0x0800)
        {
            buff[0] = cast(char)(0xc0 | ((c >> 6) & 0x3f));
            buff[1] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 2]);
        }
        else if (c < 0xd800 || c > 0xdfff)
        {
            buff[0] = cast(char)(0xe0 | ((c >> 12) & 0x3f));
            buff[1] = cast(char)(0x80 | ((c >> 6) & 0x3f));
            buff[2] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 3]);
        }
        else
        {
            // Report where the offending code unit is, not a fixed index 0
            core.exception.onUnicodeError("Unicode.toString : Surrogate pair not supported", idx);
        }
    }
}
// UTF-32 overload. Each code point is encoded on its own and passed to `dg`
// as a 1 to 4 byte slice of a local scratch buffer; the slice is only valid
// for the duration of that delegate call. Values of 0x110000 and above are
// reported through `onUnicodeError` together with their index in `input`.
// Values in the surrogate range are not rejected here; they are encoded as
// ordinary three byte sequences.
/// Ditto
public void toString (const(dchar)[] input, scope size_t delegate(cstring) dg)
{
    char[4] buff;
    foreach (size_t idx, dchar c; input)
    {
        if (c < 0x80)
        {
            // Copy the value instead of reinterpreting the dchar's memory:
            // the first byte in memory only holds the ASCII value on
            // little-endian targets.
            buff[0] = cast(char) c;
            dg(buff[0 .. 1]);
        }
        else if (c < 0x0800)
        {
            buff[0] = cast(char)(0xc0 | ((c >> 6) & 0x3f));
            buff[1] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 2]);
        }
        else if (c < 0x10000)
        {
            buff[0] = cast(char)(0xe0 | ((c >> 12) & 0x3f));
            buff[1] = cast(char)(0x80 | ((c >> 6) & 0x3f));
            buff[2] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 3]);
        }
        else if (c < 0x110000)
        {
            buff[0] = cast(char)(0xf0 | ((c >> 18) & 0x3f));
            buff[1] = cast(char)(0x80 | ((c >> 12) & 0x3f));
            buff[2] = cast(char)(0x80 | ((c >> 6) & 0x3f));
            buff[3] = cast(char)(0x80 | (c & 0x3f));
            dg(buff[0 .. 4]);
        }
        else
            core.exception.onUnicodeError("Unicode.toString : invalid dchar", idx);
    }
}
/*******************************************************************************

    Encode Utf8 up to a maximum of 4 bytes long (five & six byte
    variations are not supported).

    If the output is provided off the stack, it should be large
    enough to encompass the entire transcoding; failing to do
    so will cause the output to be moved onto the heap instead.

    Returns a slice of the output buffer, corresponding to the
    converted characters. For optimum performance, the returned
    buffer should be specified as 'output' on subsequent calls.
    For example:

    ---
    char[] output;

    char[] result = toString (input, output);

    // reset output after a realloc
    if (result.length > output.length)
        output = result;
    ---

    Where 'ate' is provided, it will be set to the number of
    elements consumed from the input, and the output buffer
    will not be resized (or allocated). This represents a
    streaming mode, where slices of the input are processed
    in sequence rather than all at one time (should use 'ate'
    as an index for slicing into unconsumed input).

    In streaming mode a code unit is only converted while at least
    three bytes of output remain free; an output shorter than that
    (including a null one) consumes nothing and yields an empty slice.

    Params:
        input  = UTF-16 text to convert
        output = buffer to write the UTF-8 bytes into
        ate    = if non-null, enables streaming mode and receives the
                 number of input elements consumed

    Returns:
        the slice of the (possibly reallocated) output holding the result

*******************************************************************************/

mstring toString (const(wchar)[] input, mstring output=null, size_t* ate=null)
{
    // Largest number of bytes written for one non-surrogate code unit
    enum MaxSeq = 3;

    if (ate)
    {
        *ate = input.length;

        // The output is never resized while streaming. Without room for
        // one full sequence nothing can be converted; bailing out here
        // also keeps the end-of-buffer pointer below from wrapping around
        // when the output is null or shorter than MaxSeq.
        if (output.length < MaxSeq)
        {
            *ate = 0;
            return output [0 .. 0];
        }
    }
    else
    {
        // potentially reallocate output
        auto estimate = input.length * 2 + MaxSeq;
        if (output.length < estimate)
            output.length = estimate;
    }

    char* pOut = output.ptr;
    // last position at which a full MaxSeq byte sequence still fits
    char* pMax = pOut + (output.length - MaxSeq);

    foreach (size_t eaten, wchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // reallocate the output buffer, always leaving room for at
            // least one more full sequence
            auto len = pOut - output.ptr;
            output.length = len + len / 2 + MaxSeq;
            pOut = output.ptr + len;
            pMax = output.ptr + (output.length - MaxSeq);
        }

        if (b < 0x80)
            *pOut++ = cast(char) b;
        else if (b < 0x0800)
        {
            pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x3f));
            pOut[1] = cast(char)(0x80 | (b & 0x3f));
            pOut += 2;
        }
        else if (b < 0xd800 || b > 0xdfff)
        {
            pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x3f));
            pOut[1] = cast(char)(0x80 | ((b >> 6) & 0x3f));
            pOut[2] = cast(char)(0x80 | (b & 0x3f));
            pOut += 3;
        }
        else
        {
            // deal with surrogate-pairs: restart the whole conversion via
            // utf32, discarding what has been produced so far.
            // NOTE(review): when streaming, toString32 is handed a null
            // output that it will not resize, so this path converts
            // nothing and reports zero consumed elements - confirm
            // whether callers rely on that.
            return toString (toString32(input, null, ate), output);
        }
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}
/*******************************************************************************

    Decode Utf8 into Utf16.

    Without 'ate', the output is grown to input.length elements when it
    is shorter than that, so an output provided off the stack that is
    too small gets moved onto the heap.

    Returns the leading slice of the output that holds the converted
    characters. For optimum performance, pass the returned buffer as
    'output' on subsequent calls.

    With 'ate', the output is never resized and 'ate' receives the
    number of input elements consumed; a sequence that is cut off at
    the end of the input is left unconsumed rather than reported as
    an error. Use 'ate' as the index for slicing into the unconsumed
    input on the next call (streaming mode).

    Input containing a lead byte of 0xf0 or above is re-converted from
    the start by way of utf32.

*******************************************************************************/

wchar[] toString16 (cstring input, wchar[] output=null, size_t* ate=null)
{
    int count;                      // wchars stored so far
    auto cursor = input.ptr;        // next byte to decode
    auto limit = cursor + input.length;
    const(char)* start;             // first byte of the current sequence

    // only the non-streaming mode may grow the output
    if (ate is null && input.length > output.length)
        output.length = input.length;

    if (input.length)
    {
        foreach (ref wchar slot; output)
        {
            start = cursor;
            wchar unit = cast(wchar) *cursor;

            if (unit & 0x80)
            {
                if (unit < 0xe0)
                {
                    // two byte sequence
                    unit &= 0x1f;
                    unit = cast(wchar)((unit << 6) | (*++cursor & 0x3f));
                }
                else if (unit < 0xf0)
                {
                    // three byte sequence
                    unit &= 0x0f;
                    unit = cast(wchar)((unit << 6) | (cursor[1] & 0x3f));
                    unit = cast(wchar)((unit << 6) | (cursor[2] & 0x3f));
                    cursor += 2;
                }
                else
                {
                    // needs a surrogate pair: start over through utf32
                    return toString16 (toString32(input, null, ate), output);
                }
            }

            slot = unit;
            ++count;
            ++cursor;

            if (cursor > limit)
            {
                // the sequence ran past the end of the input
                if (ate)
                {
                    // streaming: hand the partial tail back to the caller
                    cursor = start;
                    --count;
                    break;
                }
                core.exception.onUnicodeError("Unicode.toString16 : incomplete utf8 input", cursor - input.ptr);
            }
            else if (cursor == limit)
            {
                // all input consumed
                break;
            }
        }
    }

    if (ate)
    {
        // report how much input was consumed
        *ate = cursor - input.ptr;
    }
    else if (cursor < limit)
    {
        // unconsumed input without streaming ~ this should never happen!
        core.exception.onUnicodeError("Unicode.toString16 : utf8 overflow", cursor - input.ptr);
    }

    return output [0..count];
}
/*******************************************************************************

    Encode Utf8 up to a maximum of 4 bytes long (five & six
    byte variations are not supported). Throws an exception
    where the input dchar is greater than 0x10ffff.

    If the output is provided off the stack, it should be large
    enough to encompass the entire transcoding; failing to do
    so will cause the output to be moved onto the heap instead.

    Returns a slice of the output buffer, corresponding to the
    converted characters. For optimum performance, the returned
    buffer should be specified as 'output' on subsequent calls.

    Where 'ate' is provided, it will be set to the number of
    elements consumed from the input, and the output buffer
    will not be resized (or allocated). This represents a
    streaming mode, where slices of the input are processed
    in sequence rather than all at one time (should use 'ate'
    as an index for slicing into unconsumed input).

    In streaming mode a code point is only converted while at least
    four bytes of output remain free; an output shorter than that
    (including a null one) consumes nothing and yields an empty slice.

    Values in the surrogate range are not rejected; they are encoded
    as ordinary three byte sequences.

    Params:
        input  = UTF-32 text to convert
        output = buffer to write the UTF-8 bytes into
        ate    = if non-null, enables streaming mode and receives the
                 number of input elements consumed

    Returns:
        the slice of the (possibly reallocated) output holding the result

*******************************************************************************/

mstring toString (const(dchar)[] input, mstring output=null, size_t* ate=null)
{
    // Largest number of bytes written for one code point
    enum MaxSeq = 4;

    if (ate)
    {
        *ate = input.length;

        // The output is never resized while streaming. Without room for
        // one full sequence nothing can be converted; bailing out here
        // also keeps the end-of-buffer pointer below from wrapping around
        // when the output is null or shorter than MaxSeq.
        if (output.length < MaxSeq)
        {
            *ate = 0;
            return output [0 .. 0];
        }
    }
    else
    {
        // potentially reallocate output
        auto estimate = input.length * 2 + MaxSeq;
        if (output.length < estimate)
            output.length = estimate;
    }

    char* pOut = output.ptr;
    // last position at which a full MaxSeq byte sequence still fits
    char* pMax = pOut + (output.length - MaxSeq);

    foreach (size_t eaten, dchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // reallocate the output buffer. Growing by half alone is not
            // enough for short buffers (7 used bytes would grow to 10,
            // leaving 3 free for a 4 byte sequence), so always add room
            // for one more full sequence.
            auto len = pOut - output.ptr;
            output.length = len + len / 2 + MaxSeq;
            pOut = output.ptr + len;
            pMax = output.ptr + (output.length - MaxSeq);
        }

        if (b < 0x80)
            *pOut++ = cast(char) b;
        else if (b < 0x0800)
        {
            pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x3f));
            pOut[1] = cast(char)(0x80 | (b & 0x3f));
            pOut += 2;
        }
        else if (b < 0x10000)
        {
            pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x3f));
            pOut[1] = cast(char)(0x80 | ((b >> 6) & 0x3f));
            pOut[2] = cast(char)(0x80 | (b & 0x3f));
            pOut += 3;
        }
        else if (b < 0x110000)
        {
            pOut[0] = cast(char)(0xf0 | ((b >> 18) & 0x3f));
            pOut[1] = cast(char)(0x80 | ((b >> 12) & 0x3f));
            pOut[2] = cast(char)(0x80 | ((b >> 6) & 0x3f));
            pOut[3] = cast(char)(0x80 | (b & 0x3f));
            pOut += 4;
        }
        else
            core.exception.onUnicodeError("Unicode.toString : invalid dchar", eaten);
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}
/*******************************************************************************

    Decode Utf8 into Utf32.

    Without 'ate', the output is grown to input.length elements when it
    is shorter than that, so an output provided off the stack that is
    too small gets moved onto the heap.

    Returns the leading slice of the output that holds the converted
    characters. For optimum performance, pass the returned buffer as
    'output' on subsequent calls.

    With 'ate', the output is never resized and 'ate' receives the
    number of input elements consumed; a sequence that is cut off at
    the end of the input is left unconsumed rather than reported as
    an error. Use 'ate' as the index for slicing into the unconsumed
    input on the next call (streaming mode).

    A four byte sequence decoding to 0x110000 or above is reported
    through onUnicodeError.

*******************************************************************************/

dchar[] toString32 (const(char)[] input, dchar[] output=null, size_t* ate=null)
{
    int count;                      // dchars stored so far
    auto cursor = input.ptr;        // next byte to decode
    auto limit = cursor + input.length;
    const(char)* start;             // first byte of the current sequence

    // only the non-streaming mode may grow the output
    if (ate is null && input.length > output.length)
        output.length = input.length;

    if (input.length)
    {
        foreach (ref dchar slot; output)
        {
            start = cursor;
            dchar code = cast(dchar) *cursor;

            if (code & 0x80)
            {
                if (code < 0xe0)
                {
                    // two byte sequence
                    code &= 0x1f;
                    code = (code << 6) | (*++cursor & 0x3f);
                }
                else if (code < 0xf0)
                {
                    // three byte sequence
                    code &= 0x0f;
                    code = (code << 6) | (cursor[1] & 0x3f);
                    code = (code << 6) | (cursor[2] & 0x3f);
                    cursor += 2;
                }
                else
                {
                    // four byte sequence
                    code &= 0x07;
                    code = (code << 6) | (cursor[1] & 0x3f);
                    code = (code << 6) | (cursor[2] & 0x3f);
                    code = (code << 6) | (cursor[3] & 0x3f);

                    if (code >= 0x110000)
                        core.exception.onUnicodeError("Unicode.toString32 : invalid utf8 input", cursor - input.ptr);
                    cursor += 3;
                }
            }

            slot = code;
            ++count;
            ++cursor;

            if (cursor > limit)
            {
                // the sequence ran past the end of the input
                if (ate)
                {
                    // streaming: hand the partial tail back to the caller
                    cursor = start;
                    --count;
                    break;
                }
                core.exception.onUnicodeError("Unicode.toString32 : incomplete utf8 input", cursor - input.ptr);
            }
            else if (cursor == limit)
            {
                // all input consumed
                break;
            }
        }
    }

    if (ate)
    {
        // report how much input was consumed
        *ate = cursor - input.ptr;
    }
    else if (cursor < limit)
    {
        // unconsumed input without streaming ~ this should never happen!
        core.exception.onUnicodeError("Unicode.toString32 : utf8 overflow", cursor - input.ptr);
    }

    return output [0..count];
}
/*******************************************************************************

    Encode Utf16 up to a maximum of 2 bytes long. Throws an exception
    where the input dchar is greater than 0x10ffff.

    If the output is provided off the stack, it should be large
    enough to encompass the entire transcoding; failing to do
    so will cause the output to be moved onto the heap instead.

    Returns a slice of the output buffer, corresponding to the
    converted characters. For optimum performance, the returned
    buffer should be specified as 'output' on subsequent calls.

    Where 'ate' is provided, it will be set to the number of
    elements consumed from the input, and the output buffer
    will not be resized (or allocated). This represents a
    streaming mode, where slices of the input are processed
    in sequence rather than all at one time (should use 'ate'
    as an index for slicing into unconsumed input).

    In streaming mode a code point is only converted while at least
    two elements of output remain free; an output shorter than that
    (including a null one) consumes nothing and yields an empty slice.

    Params:
        input  = UTF-32 text to convert
        output = buffer to write the UTF-16 code units into
        ate    = if non-null, enables streaming mode and receives the
                 number of input elements consumed

    Returns:
        the slice of the (possibly reallocated) output holding the result

*******************************************************************************/

wchar[] toString16 (const(dchar)[] input, wchar[] output=null, size_t* ate=null)
{
    // Largest number of code units written for one code point
    enum MaxUnits = 2;

    if (ate)
    {
        *ate = input.length;

        // The output is never resized while streaming. Without room for
        // one full pair nothing can be converted; bailing out here also
        // keeps the end-of-buffer pointer below from wrapping around
        // when the output is null or shorter than MaxUnits.
        if (output.length < MaxUnits)
        {
            *ate = 0;
            return output [0 .. 0];
        }
    }
    else
    {
        auto estimate = input.length * 2 + MaxUnits;
        if (output.length < estimate)
            output.length = estimate;
    }

    wchar* pOut = output.ptr;
    // last position at which a full surrogate pair still fits
    wchar* pMax = pOut + (output.length - MaxUnits);

    foreach (size_t eaten, dchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // reallocate the output buffer, always leaving room for at
            // least one more surrogate pair
            size_t len = pOut - output.ptr;
            output.length = len + len / 2 + MaxUnits;
            pOut = output.ptr + len;
            pMax = output.ptr + (output.length - MaxUnits);
        }

        if (b < 0x10000)
            *pOut++ = cast(wchar) b;
        else if (b < 0x110000)
        {
            // split into a leading and a trailing surrogate
            pOut[0] = cast(wchar)(0xd800 | (((b - 0x10000) >> 10) & 0x3ff));
            pOut[1] = cast(wchar)(0xdc00 | ((b - 0x10000) & 0x3ff));
            pOut += 2;
        }
        else
            core.exception.onUnicodeError("Unicode.toString16 : invalid dchar", eaten);
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}
/*******************************************************************************

    Decode Utf16 into Utf32.

    Without 'ate', the output is grown to input.length elements when it
    is shorter than that, so an output provided off the stack that is
    too small gets moved onto the heap.

    Returns the leading slice of the output that holds the converted
    characters. For optimum performance, pass the returned buffer as
    'output' on subsequent calls.

    With 'ate', the output is never resized and 'ate' receives the
    number of input elements consumed; a surrogate pair that is cut
    off at the end of the input is left unconsumed rather than reported
    as an error. Use 'ate' as the index for slicing into the unconsumed
    input on the next call (streaming mode).

    A result of 0x110000 or above is reported through onUnicodeError.

*******************************************************************************/

dchar[] toString32 (const(wchar)[] input, dchar[] output=null, size_t* ate=null)
{
    int count;                      // dchars stored so far
    auto cursor = input.ptr;        // next code unit to decode
    auto limit = cursor + input.length;
    const(wchar)* start;            // first unit of the current character

    // only the non-streaming mode may grow the output
    if (ate is null && input.length > output.length)
        output.length = input.length;

    if (input.length)
    {
        foreach (ref dchar slot; output)
        {
            start = cursor;
            dchar code = cast(dchar) *cursor;

            // surrogate range: combine with the following code unit
            // ~ see http://www.unicode.org/faq/utf_bom.html#35
            if (code >= 0xd800 && code <= 0xdfff)
                code = ((code - 0xd7c0) << 10) + (*++cursor - 0xdc00);

            if (code >= 0x110000)
                core.exception.onUnicodeError("Unicode.toString32 : invalid utf16 input", cursor - input.ptr);

            slot = code;
            ++count;
            ++cursor;

            if (cursor > limit)
            {
                // the pair ran past the end of the input
                if (ate)
                {
                    // streaming: hand the partial tail back to the caller
                    cursor = start;
                    --count;
                    break;
                }
                core.exception.onUnicodeError("Unicode.toString32 : incomplete utf16 input", cursor - input.ptr);
            }
            else if (cursor == limit)
            {
                // all input consumed
                break;
            }
        }
    }

    if (ate)
    {
        // report how much input was consumed
        *ate = cursor - input.ptr;
    }
    else if (cursor < limit)
    {
        // unconsumed input without streaming ~ this should never happen!
        core.exception.onUnicodeError("Unicode.toString32 : utf16 overflow", cursor - input.ptr);
    }

    return output [0..count];
}
/*******************************************************************************

    Decodes the first dchar of the given utf8 text.

    The conversion runs in streaming mode into a one element buffer, so
    'ate' receives the number of chars that were consumed from src.

    NOTE(review): the first element of the conversion result is indexed
    unconditionally; when nothing could be decoded (e.g. empty src) the
    result is empty and the index is out of range - confirm callers never
    pass such input.

*******************************************************************************/

dchar decode (cstring src, ref size_t ate)
{
    dchar[1] single;
    auto decoded = toString32 (src, single, &ate);
    return decoded[0];
}
/*******************************************************************************

    Decodes the first dchar of the given utf16 text.

    The conversion runs in streaming mode into a one element buffer, so
    'ate' receives the number of wchars that were consumed from src.

    NOTE(review): the first element of the conversion result is indexed
    unconditionally; when nothing could be decoded (e.g. empty src) the
    result is empty and the index is out of range - confirm callers never
    pass such input.

*******************************************************************************/

dchar decode (const(wchar)[] src, ref size_t ate)
{
    dchar[1] single;
    auto decoded = toString32 (src, single, &ate);
    return decoded[0];
}
/*******************************************************************************

    Encodes one dchar as utf8 and returns the slice holding the encoding.

    The character is viewed as a one element array and passed, together
    with dst, to the non-streaming utf32 => utf8 toString().

*******************************************************************************/

mstring encode (mstring dst, dchar c)
{
    // view the by-value parameter as a one element array
    auto single = (&c)[0..1];
    return toString (single, dst);
}
/*******************************************************************************

    Encodes one dchar as utf16 and returns the slice holding the encoding.

    The character is viewed as a one element array and passed, together
    with dst, to the non-streaming utf32 => utf16 toString16().

*******************************************************************************/

wchar[] encode (wchar[] dst, dchar c)
{
    // view the by-value parameter as a one element array
    auto single = (&c)[0..1];
    return toString16 (single, dst);
}
/*******************************************************************************

    Tells whether the given character is valid: it must not lie in the
    surrogate range 0xD800 - 0xDFFF and must not exceed 0x10FFFF.

*******************************************************************************/

bool isValid (dchar c)
{
    // surrogates are reserved for utf16 pairs
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    return c <= 0x10FFFF;
}
/*******************************************************************************

    Convert utf8 text into the encoding selected by the element type of
    dst.

    For wchar and dchar the matching transcoder is called with dst as its
    output buffer and its result is returned. For char no conversion is
    required and the input itself is returned. Any other element type is
    rejected at compile time.

*******************************************************************************/

const(T)[] fromString8 (T) (cstring s, T[] dst)
{
    static if (is(T == dchar))
        return .toString32 (s, dst);
    else static if (is(T == wchar))
        return .toString16 (s, dst);
    else static if (is(T == char))
        return s;
    else
        static assert (false);
}
/*******************************************************************************

    Convert utf16 text into the encoding selected by the type of dst.

    For char[] and dchar[] the matching transcoder is called with dst as
    its output buffer and its result is returned. For wchar[] no conversion
    is required and the input itself is returned.

*******************************************************************************/

const(char)[] fromString16 (const(wchar)[] s, char[] dst)
{
    auto converted = .toString (s, dst);
    return converted;
}

const(wchar)[] fromString16 (const(wchar)[] s, wchar[] dst)
{
    // same encoding: nothing to do
    return s;
}

const(dchar)[] fromString16 (const(wchar)[] s, dchar[] dst)
{
    auto converted = .toString32 (s, dst);
    return converted;
}
/*******************************************************************************

    Convert utf32 text into the encoding selected by the type of dst.

    For char[] and wchar[] the matching transcoder is called with dst as
    its output buffer and its result is returned. For dchar[] no conversion
    is required and the input itself is returned.

*******************************************************************************/

const(char)[] fromString32 (const(dchar)[] s, char[] dst)
{
    auto converted = .toString (s, dst);
    return converted;
}

const(wchar)[] fromString32 (const(dchar)[] s, wchar[] dst)
{
    auto converted = .toString16 (s, dst);
    return converted;
}

const(dchar)[] fromString32 (const(dchar)[] s, dchar[] dst)
{
    // same encoding: nothing to do
    return s;
}
/*******************************************************************************

    Adjust the content such that no partial encodings exist on the
    left side of the provided text.

    For utf8 (char elements of any constness) all leading continuation
    bytes (10xxxxxx) are dropped, so the result starts at a lead byte, at
    an ASCII byte, or is empty. For utf16 (wchar elements of any
    constness) a leading trailing-surrogate is dropped. Any other element
    type, and empty input, is returned unchanged.

    Params:
        s = text to crop

    Returns:
        a slice of the input

*******************************************************************************/

T[] cropLeft(T) (T[] s)
{
    // Compare with qualifiers normalised: a plain `is (T == char)` is false
    // for const(char) and immutable(char), which would leave string
    // arguments untouched.
    static if (is (immutable(T) == immutable(char)))
    {
        size_t skip = 0;

        // continuation bytes cannot start a sequence, whatever follows them
        while (skip < s.length && (s[skip] & 0xc0) == 0x80)
            ++skip;

        return s [skip .. $];
    }
    else static if (is (immutable(T) == immutable(wchar)))
    {
        // skip if first char is a trailing surrogate
        if (s.length && (s[0] & 0xfc00) == 0xdc00)
            return s [1 .. $];

        return s;
    }
    else
        return s;
}
/*******************************************************************************

    Adjust the content such that no partial encodings exist on the
    right side of the provided text.

    For utf8 (char elements of any constness) the text is scanned
    backwards over trailing continuation bytes. When a lead byte is found,
    the number of bytes from it to the end is compared with the sequence
    length the lead byte announces; on a mismatch the text is cut just
    before that lead byte. Text ending in an ASCII byte, and text in which
    no lead byte is found before an ASCII byte or the start, is returned
    unchanged.

    For utf16 (wchar elements of any constness) a final leading-surrogate
    is dropped. Any other element type, and empty input, is returned
    unchanged.

    Params:
        s = text to crop

    Returns:
        a slice of the input

*******************************************************************************/

T[] cropRight(T) (T[] s)
{
    // Compare with qualifiers normalised: a plain `is (T == char)` is false
    // for const(char) and immutable(char), which would leave string
    // arguments untouched.
    static if (is (immutable(T) == immutable(char)))
    {
        size_t i = s.length;

        // walk backwards; index 0 is examined as well, so that a lead
        // byte at the very start of the text is not missed
        while (i > 0)
        {
            immutable ubyte b = s[i - 1];

            // ASCII ends the scan: nothing partial behind it
            if ((b & 0x80) == 0)
                break;

            --i;

            if ((b & 0xc0) == 0xc0)
            {
                // located the first byte of a sequence at index i
                size_t expected = 2;

                // The four byte marker only counts when the three byte
                // marker is set too: bit 0x10 on its own is payload of a
                // two byte lead (0xd0 - 0xdf).
                if (b & 0x20)
                {
                    expected = 3;
                    if (b & 0x10)
                        expected = 4;
                }

                // is the sequence complete?
                return (s.length - i == expected) ? s : s [0 .. i];
            }
        }

        return s;
    }
    else static if (is (immutable(T) == immutable(wchar)))
    {
        // skip if last char is a leading surrogate
        if (s.length && (s[$ - 1] & 0xfc00) == 0xd800)
            return s [0 .. $ - 1];

        return s;
    }
    else
        return s;
}
/*******************************************************************************

    Manual check, compiled only with the 'Utf' debug identifier: prints a
    sample text, then the result of cropLeft() for the text with 0 to 5
    elements removed from its front, then the result of cropRight() for
    the text with 0 to 5 elements removed from its end.

*******************************************************************************/

debug (Utf)
{
    import ocean.io.Console;

    void main()
    {
        auto s = "[\xc2\xa2\xc2\xa2\xc2\xa2]";
        Cout (s).newline;

        // drop 0 .. 5 leading elements, then crop the left edge
        foreach (size_t cut; 0 .. 6)
            Cout (cropLeft(s[cut..$])).newline;

        // drop 0 .. 5 trailing elements, then crop the right edge
        foreach (size_t cut; 0 .. 6)
            Cout (cropRight(s[0..$-cut])).newline;
    }
}