1 /******************************************************************************* 2 3 Fast Unicode transcoders. These are particularly sensitive to 4 minor changes on 32bit x86 devices, because the register set of 5 those devices is so small. Beware of subtle changes which might 6 extend the execution-period by as much as 200%. Because of this, 7 three of the six transcoders might read past the end of input by 8 one, two, or three bytes before arresting themselves. Note that 9 support for streaming adds a 15% overhead to the dchar => char 10 conversion, but has little effect on the others. 11 12 These routines were tuned on an Intel P4; other devices may work 13 more efficiently with a slightly different approach, though this 14 is likely to be reasonably optimal on AMD x86 CPUs also. These 15 algorithms would benefit significantly from those extra AMD64 16 registers. On a 3GHz P4, the dchar/char conversions take around 17 2500ns to process an array of 1000 ASCII elements. Invoking the 18 memory manager doubles that period, and quadruples the time for 19 arrays of 100 elements. Memory allocation can slow down notably 20 in a multi-threaded environment, so avoid that where possible. 21 22 Surrogate-pairs are dealt with in a non-optimal fashion when 23 transcoding between utf16 and utf8. Such cases are considered 24 to be boundary-conditions for this module. 25 26 There are three common cases where the input may be incomplete, 27 including each 'widening' case of utf8 => utf16, utf8 => utf32, 28 and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate 29 pairs are present. Such cases will throw an exception, unless 30 streaming-mode is enabled ~ in the latter mode, an additional 31 integer is returned indicating how many elements of the input 32 have been consumed. In all cases, a correct slice of the output 33 is returned. 
For details on Unicode processing see:
        $(UL $(LINK http://www.utf-8.com/))
        $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
        $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
        $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))

        Copyright:
            Copyright (c) 2004 Kris Bell.
            Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
            All rights reserved.

        License:
            Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
            See LICENSE_TANGO.txt for details.

        Version: Initial release: Oct 2004

        Authors: Kris

*******************************************************************************/

module ocean.text.convert.Utf;

static import core.exception;

import ocean.meta.types.Qualifiers;

/*******************************************************************************

        Identity "conversions": source and destination share the same
        character type, so the input slice is handed back untouched.

        Neither `dst` nor `ate` is read or written by these overloads;
        they exist only so that generic code can call `toString` with a
        uniform argument list.

*******************************************************************************/

const(char)[] toString (const(char)[] src, char[] dst=null, size_t* ate=null)
{
    return src;
}

const(wchar)[] toString (const(wchar)[] src, wchar[] dst, size_t* ate=null)
{
    return src;
}

const(dchar)[] toString (const(dchar)[] src, dchar[] dst, size_t* ate=null)
{
    return src;
}


/*******************************************************************************

    Encode a string of characters into an UTF-8 string, providing one character
    at a time to the delegate.

    This allows shifting the allocation strategy onto the user, who might have
    more information about the kind of data passed to this function.

    Parameters:
        input = UTF-8, UTF-16 or UTF-32 encoded string to encode to UTF-8
        dg = Output delegate to pass the result to

    Note:
        Unlike the other `toString` variant, UTF-16 -> UTF-8 doesn't support
        surrogate pairs and will call `onUnicodeError`.
89 90 *******************************************************************************/ 91 92 public void toString (const(char)[] input, scope size_t delegate(cstring) dg) 93 { 94 dg(input); 95 } 96 97 /// Ditto 98 public void toString (const(wchar)[] input, scope size_t delegate(cstring) dg) 99 { 100 char[4] buff; 101 foreach (size_t idx, wchar c; input) 102 { 103 if (c < 0x80) 104 dg((cast(const(char)*) &c)[0 .. 1]); 105 else if (c < 0x0800) 106 { 107 buff[0] = cast(char)(0xc0 | ((c >> 6) & 0x3f)); 108 buff[1] = cast(char)(0x80 | (c & 0x3f)); 109 dg(buff[0 .. 2]); 110 } 111 else if (c < 0xd800 || c > 0xdfff) 112 { 113 buff[0] = cast(char)(0xe0 | ((c >> 12) & 0x3f)); 114 buff[1] = cast(char)(0x80 | ((c >> 6) & 0x3f)); 115 buff[2] = cast(char)(0x80 | (c & 0x3f)); 116 dg(buff[0 .. 3]); 117 } 118 else 119 core.exception.onUnicodeError("Unicode.toString : Surrogate pair not supported", 0); 120 } 121 } 122 123 /// Ditto 124 public void toString (const(dchar)[] input, scope size_t delegate(cstring) dg) 125 { 126 char[4] buff; 127 foreach (size_t idx, dchar c; input) 128 { 129 if (c < 0x80) 130 dg((cast(const(char)*) &c)[0 .. 1]); 131 else if (c < 0x0800) 132 { 133 buff[0] = cast(char)(0xc0 | ((c >> 6) & 0x3f)); 134 buff[1] = cast(char)(0x80 | (c & 0x3f)); 135 dg(buff[0 .. 2]); 136 } 137 else if (c < 0x10000) 138 { 139 buff[0] = cast(char)(0xe0 | ((c >> 12) & 0x3f)); 140 buff[1] = cast(char)(0x80 | ((c >> 6) & 0x3f)); 141 buff[2] = cast(char)(0x80 | (c & 0x3f)); 142 dg(buff[0 .. 
3]); 143 } 144 else if (c < 0x110000) 145 { 146 buff[0] = cast(char)(0xf0 | ((c >> 18) & 0x3f)); 147 buff[1] = cast(char)(0x80 | ((c >> 12) & 0x3f)); 148 buff[2] = cast(char)(0x80 | ((c >> 6) & 0x3f)); 149 buff[3] = cast(char)(0x80 | (c & 0x3f)); 150 dg(buff); 151 } 152 else 153 core.exception.onUnicodeError("Unicode.toString : invalid dchar", idx); 154 } 155 } 156 157 158 /******************************************************************************* 159 160 Encode Utf8 up to a maximum of 4 bytes long (five & six byte 161 variations are not supported). 162 163 If the output is provided off the stack, it should be large 164 enough to encompass the entire transcoding; failing to do 165 so will cause the output to be moved onto the heap instead. 166 167 Returns a slice of the output buffer, corresponding to the 168 converted characters. For optimum performance, the returned 169 buffer should be specified as 'output' on subsequent calls. 170 For example: 171 172 --- 173 char[] output; 174 175 char[] result = toString (input, output); 176 177 // reset output after a realloc 178 if (result.length > output.length) 179 output = result; 180 --- 181 182 Where 'ate' is provided, it will be set to the number of 183 elements consumed from the input, and the output buffer 184 will not be resized (or allocated). This represents a 185 streaming mode, where slices of the input are processed 186 in sequence rather than all at one time (should use 'ate' 187 as an index for slicing into unconsumed input). 
188 189 *******************************************************************************/ 190 191 mstring toString (const(wchar)[] input, mstring output=null, size_t* ate=null) 192 { 193 if (ate) 194 *ate = input.length; 195 else 196 { 197 // potentially reallocate output 198 auto estimate = input.length * 2 + 3; 199 if (output.length < estimate) 200 output.length = estimate; 201 } 202 203 char* pOut = output.ptr; 204 char* pMax = pOut + output.length - 3; 205 206 foreach (size_t eaten, wchar b; input) 207 { 208 // about to overflow the output? 209 if (pOut > pMax) 210 { 211 // if streaming, just return the unused input 212 if (ate) 213 { 214 *ate = eaten; 215 break; 216 } 217 218 // reallocate the output buffer 219 auto len = pOut - output.ptr; 220 output.length = len + len / 2; 221 pOut = output.ptr + len; 222 pMax = output.ptr + output.length - 3; 223 } 224 225 if (b < 0x80) 226 *pOut++ = cast(char) b; 227 else 228 { 229 if (b < 0x0800) 230 { 231 pOut[0] = cast(wchar)(0xc0 | ((b >> 6) & 0x3f)); 232 pOut[1] = cast(wchar)(0x80 | (b & 0x3f)); 233 pOut += 2; 234 } 235 else 236 { 237 if (b < 0xd800 || b > 0xdfff) 238 { 239 pOut[0] = cast(wchar)(0xe0 | ((b >> 12) & 0x3f)); 240 pOut[1] = cast(wchar)(0x80 | ((b >> 6) & 0x3f)); 241 pOut[2] = cast(wchar)(0x80 | (b & 0x3f)); 242 pOut += 3; 243 } 244 else 245 { 246 // deal with surrogate-pairs 247 return toString (toString32(input, null, ate), output); 248 } 249 } 250 } 251 } 252 253 // return the produced output 254 return output [0..(pOut - output.ptr)]; 255 } 256 257 /******************************************************************************* 258 259 Decode Utf8 produced by the above toString() method. 260 261 If the output is provided off the stack, it should be large 262 enough to encompass the entire transcoding; failing to do 263 so will cause the output to be moved onto the heap instead. 264 265 Returns a slice of the output buffer, corresponding to the 266 converted characters. 
For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        Lead bytes of 0xf0 and above (four byte sequences) cause the
        whole input to be re-converted through toString32() and the
        utf32 => utf16 encoder.

        NOTE(review): in streaming mode that toString32() call is
        given no output buffer and does not allocate one, so it
        appears that nothing is consumed in that case - confirm.

*******************************************************************************/

wchar[] toString16 (cstring input, wchar[] output=null, size_t* ate=null)
{
    size_t produced;
    auto pIn = input.ptr;
    auto pMax = pIn + input.length;
    const(char)* pValid;

    if (ate is null)
    {
        // one wchar per input byte is always sufficient here
        if (input.length > output.length)
            output.length = input.length;
    }

    if (input.length)
    {
        foreach (ref wchar d; output)
        {
            // remember where this sequence started, so that streaming
            // mode can rewind to it when the sequence is truncated
            pValid = pIn;
            wchar b = cast(wchar) *pIn;

            if (b & 0x80)
            {
                if (b < 0xe0)
                {
                    // two byte sequence
                    b &= 0x1f;
                    b = cast(wchar)((b << 6) | (*++pIn & 0x3f));
                }
                else if (b < 0xf0)
                {
                    // three byte sequence
                    b &= 0x0f;
                    b = cast(wchar)((b << 6) | (pIn[1] & 0x3f));
                    b = cast(wchar)((b << 6) | (pIn[2] & 0x3f));
                    pIn += 2;
                }
                else
                    // deal with surrogate-pairs
                    return toString16 (toString32(input, null, ate), output);
            }

            d = b;
            ++produced;

            // did we read past the end of the input?
            if (++pIn >= pMax)
            {
                if (pIn > pMax)
                {
                    // yep ~ return tail or throw error?
                    if (ate)
                    {
                        pIn = pValid;
                        --produced;
                        break;
                    }
                    core.exception.onUnicodeError("Unicode.toString16 : incomplete utf8 input", pIn - input.ptr);
                }
                else
                    break;
            }
        }
    }

    // do we still have some input left?
    if (ate)
        *ate = pIn - input.ptr;
    else
    {
        if (pIn < pMax)
            // this should never happen!
            core.exception.onUnicodeError("Unicode.toString16 : utf8 overflow", pIn - input.ptr);
    }

    // return the produced output
    return output [0..produced];
}


/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six
        byte variations are not supported). Throws an exception
        where the input dchar is greater than 0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        In streaming mode an output buffer shorter than four chars
        cannot hold a complete encoding; nothing is consumed in that
        case ('ate' is set to zero and an empty slice is returned).

*******************************************************************************/

mstring toString (const(dchar)[] input, mstring output=null, size_t* ate=null)
{
    if (ate)
    {
        // Without room for one full 4-byte encoding the limit pointer
        // below would lie before the buffer start (and wrap around for a
        // null buffer, defeating the overflow check entirely).
        if (output.length < 4)
        {
            *ate = 0;
            return output [0 .. 0];
        }

        *ate = input.length;
    }
    else
    {
        // potentially reallocate output
        auto estimate = input.length * 2 + 4;
        if (output.length < estimate)
            output.length = estimate;
    }

    char* pOut = output.ptr;
    // last position at which a 4-byte encoding still fits
    char* pMax = pOut + output.length - 4;

    foreach (size_t eaten, dchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // Reallocate the output buffer. The extra 4 guarantees room
            // for the encoding written right below: growing by len / 2
            // alone is not enough when len is 7 (e.g. three code points
            // needing 4 + 3 + 4 bytes against an estimate of 10).
            auto len = pOut - output.ptr;
            output.length = len + len / 2 + 4;
            pOut = output.ptr + len;
            pMax = output.ptr + output.length - 4;
        }

        if (b < 0x80)
            *pOut++ = cast(char) b;
        else if (b < 0x0800)
        {
            pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x3f));
            pOut[1] = cast(char)(0x80 | (b & 0x3f));
            pOut += 2;
        }
        else if (b < 0x10000)
        {
            pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x3f));
            pOut[1] = cast(char)(0x80 | ((b >> 6) & 0x3f));
            pOut[2] = cast(char)(0x80 | (b & 0x3f));
            pOut += 3;
        }
        else if (b < 0x110000)
        {
            pOut[0] = cast(char)(0xf0 | ((b >> 18) & 0x3f));
            pOut[1] = cast(char)(0x80 | ((b >> 12) & 0x3f));
            pOut[2] = cast(char)(0x80 | ((b >> 6) & 0x3f));
            pOut[3] = cast(char)(0x80 | (b & 0x3f));
            pOut += 4;
        }
        else
            core.exception.onUnicodeError("Unicode.toString : invalid dchar", eaten);
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}


/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated).
This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        A decoded value of 0x110000 or above is reported through
        core.exception.onUnicodeError, as is input that ends in the
        middle of a sequence when not streaming.

*******************************************************************************/

dchar[] toString32 (const(char)[] input, dchar[] output=null, size_t* ate=null)
{
    size_t produced;
    auto pIn = input.ptr;
    auto pMax = pIn + input.length;
    const(char)* pValid;

    // one dchar per input byte is always sufficient
    if (ate is null)
        if (input.length > output.length)
            output.length = input.length;

    if (input.length)
    {
        foreach (ref dchar d; output)
        {
            // remember where this sequence started, so that streaming
            // mode can rewind to it when the sequence is truncated
            pValid = pIn;
            dchar b = cast(dchar) *pIn;

            if (b & 0x80)
            {
                if (b < 0xe0)
                {
                    // two byte sequence
                    b &= 0x1f;
                    b = (b << 6) | (*++pIn & 0x3f);
                }
                else if (b < 0xf0)
                {
                    // three byte sequence
                    b &= 0x0f;
                    b = (b << 6) | (pIn[1] & 0x3f);
                    b = (b << 6) | (pIn[2] & 0x3f);
                    pIn += 2;
                }
                else
                {
                    // four byte sequence
                    b &= 0x07;
                    b = (b << 6) | (pIn[1] & 0x3f);
                    b = (b << 6) | (pIn[2] & 0x3f);
                    b = (b << 6) | (pIn[3] & 0x3f);

                    if (b >= 0x110000)
                        core.exception.onUnicodeError("Unicode.toString32 : invalid utf8 input", pIn - input.ptr);
                    pIn += 3;
                }
            }

            d = b;
            ++produced;

            // did we read past the end of the input?
            if (++pIn >= pMax)
            {
                if (pIn > pMax)
                {
                    // yep ~ return tail or throw error?
                    if (ate)
                    {
                        pIn = pValid;
                        --produced;
                        break;
                    }
                    core.exception.onUnicodeError("Unicode.toString32 : incomplete utf8 input", pIn - input.ptr);
                }
                else
                    break;
            }
        }
    }

    // do we still have some input left?
    if (ate)
        *ate = pIn - input.ptr;
    else
    {
        if (pIn < pMax)
            // this should never happen!
            core.exception.onUnicodeError("Unicode.toString32 : utf8 overflow", pIn - input.ptr);
    }

    // return the produced output
    return output [0..produced];
}

/*******************************************************************************

        Encode Utf16, emitting at most two wchars (a surrogate pair)
        per input dchar. Throws an exception where the input dchar is
        greater than 0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        In streaming mode an output buffer shorter than two wchars
        cannot hold a complete encoding; nothing is consumed in that
        case ('ate' is set to zero and an empty slice is returned).

*******************************************************************************/

wchar[] toString16 (const(dchar)[] input, wchar[] output=null, size_t* ate=null)
{
    if (ate)
    {
        // Without room for one surrogate pair the limit pointer below
        // would lie before the buffer start (and wrap around for a null
        // buffer, defeating the overflow check entirely).
        if (output.length < 2)
        {
            *ate = 0;
            return output [0 .. 0];
        }

        *ate = input.length;
    }
    else
    {
        auto estimate = input.length * 2 + 2;
        if (output.length < estimate)
            output.length = estimate;
    }

    wchar* pOut = output.ptr;
    // last position at which a surrogate pair still fits
    wchar* pMax = pOut + output.length - 2;

    foreach (size_t eaten, dchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // if streaming, just return the unused input
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // reallocate the output buffer
            size_t len = pOut - output.ptr;
            output.length = len + len / 2;
            pOut = output.ptr + len;
            pMax = output.ptr + output.length - 2;
        }

        if (b < 0x10000)
            *pOut++ = cast(wchar) b;
        else if (b < 0x110000)
        {
            // split into a leading / trailing surrogate
            pOut[0] = cast(wchar)(0xd800 | (((b - 0x10000) >> 10) & 0x3ff));
            pOut[1] = cast(wchar)(0xdc00 | ((b - 0x10000) & 0x3ff));
            pOut += 2;
        }
        else
            core.exception.onUnicodeError("Unicode.toString16 : invalid dchar", eaten);
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}

/*******************************************************************************

        Decode Utf16 produced by the above toString16() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).
644 645 *******************************************************************************/ 646 647 dchar[] toString32 (const(wchar)[] input, dchar[] output=null, size_t* ate=null) 648 { 649 int produced; 650 auto pIn = input.ptr; 651 auto pMax = pIn + input.length; 652 const(wchar)* pValid; 653 654 if (ate is null) 655 if (input.length > output.length) 656 output.length = input.length; 657 658 if (input.length) 659 { 660 foreach (ref dchar d; output) 661 { 662 pValid = pIn; 663 dchar b = cast(dchar) *pIn; 664 665 // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35 666 if (b >= 0xd800 && b <= 0xdfff) 667 b = ((b - 0xd7c0) << 10) + (*++pIn - 0xdc00); 668 669 if (b >= 0x110000) 670 core.exception.onUnicodeError("Unicode.toString32 : invalid utf16 input", pIn - input.ptr); 671 672 d = b; 673 ++produced; 674 675 if (++pIn >= pMax) 676 { 677 if (pIn > pMax) 678 { 679 // yep ~ return tail or throw error? 680 if (ate) 681 { 682 pIn = pValid; 683 --produced; 684 break; 685 } 686 core.exception.onUnicodeError("Unicode.toString32 : incomplete utf16 input", pIn - input.ptr); 687 } 688 else 689 break; 690 } 691 } 692 } 693 694 // do we still have some input left? 695 if (ate) 696 *ate = pIn - input.ptr; 697 else 698 { 699 if (pIn < pMax) 700 // this should never happen! 701 core.exception.onUnicodeError("Unicode.toString32 : utf16 overflow", pIn - input.ptr); 702 } 703 704 // return the produced output 705 return output [0..produced]; 706 } 707 708 709 /******************************************************************************* 710 711 Decodes a single dchar from the given src text, and indicates how 712 many chars were consumed from src to do so. 

*******************************************************************************/

dchar decode (cstring src, ref size_t ate)
{
    // streaming conversion into a one-element scratch buffer
    dchar[1] scratch;
    auto decoded = toString32 (src, scratch, &ate);
    return decoded[0];
}

/*******************************************************************************

        Decodes a single dchar from the given src text, and indicates how
        many wchars were consumed from src to do so.

*******************************************************************************/

dchar decode (const(wchar)[] src, ref size_t ate)
{
    // streaming conversion into a one-element scratch buffer
    dchar[1] scratch;
    auto decoded = toString32 (src, scratch, &ate);
    return decoded[0];
}

/*******************************************************************************

        Encode a dchar into the provided dst array, and return a slice of
        it representing the encoding

*******************************************************************************/

mstring encode (mstring dst, dchar c)
{
    // view the single character as a one-element array
    auto single = (&c)[0 .. 1];
    return toString (single, dst);
}

/*******************************************************************************

        Encode a dchar into the provided dst array, and return a slice of
        it representing the encoding

*******************************************************************************/

wchar[] encode (wchar[] dst, dchar c)
{
    // view the single character as a one-element array
    auto single = (&c)[0 .. 1];
    return toString16 (single, dst);
}

/*******************************************************************************

        Is the given character valid?

        Returns true for values below 0xD800, and for values above 0xDFFF
        up to and including 0x10FFFF.

*******************************************************************************/

bool isValid (dchar c)
{
    if (c < 0xD800)
        return true;

    return c > 0xDFFF && c <= 0x10FFFF;
}

/*******************************************************************************

        Convert from a char[] into the type of the dst provided.
Returns a slice of the given dst, where it is sufficiently large
        to house the result, or a heap-allocated array otherwise. Returns
        the original input where no conversion is required.

*******************************************************************************/

const(T)[] fromString8 (T) (cstring s, T[] dst)
{
    // dispatch on the destination element type
    static if (is(T == dchar))
        return .toString32 (s, dst);
    else static if (is(T == wchar))
        return .toString16 (s, dst);
    else static if (is(T == char))
        return s;
    else
        static assert (false);
}

/*******************************************************************************

        Convert from a wchar[] into the type of the dst provided.

        Returns a slice of the given dst, where it is sufficiently large
        to house the result, or a heap-allocated array otherwise. Returns
        the original input where no conversion is required.

*******************************************************************************/

const(char)[] fromString16 (const(wchar)[] s, char[] dst)
{
    auto utf8 = .toString (s, dst);
    return utf8;
}

const(wchar)[] fromString16 (const(wchar)[] s, wchar[] dst)
{
    // same encoding: nothing to do
    return s;
}


const(dchar)[] fromString16 (const(wchar)[] s, dchar[] dst)
{
    auto utf32 = .toString32 (s, dst);
    return utf32;
}

/*******************************************************************************

        Convert from a dchar[] into the type of the dst provided.

        Returns a slice of the given dst, where it is sufficiently large
        to house the result, or a heap-allocated array otherwise. Returns
        the original input where no conversion is required.

*******************************************************************************/

const(char)[] fromString32 (const(dchar)[] s, char[] dst)
{
    return .toString (s, dst);
}

const(wchar)[] fromString32 (const(dchar)[] s, wchar[] dst)
{
    return .toString16 (s, dst);
}

const(dchar)[] fromString32 (const(dchar)[] s, dchar[] dst)
{
    // same encoding: nothing to do
    return s;
}

/*******************************************************************************

        Adjust the content such that no partial encodings exist on the
        left side of the provided text.

        For utf8 text, leading continuation bytes (10xxxxxx) are dropped,
        since they belong to a sequence whose lead byte is missing. For
        utf16 text, a leading trailing-surrogate is dropped. Other element
        types, and empty input, are returned unchanged.

        The element type may carry any qualifier (char, const(char),
        immutable(char) ...).

        Params:
            s = the text to crop

        Returns:
            a slice of the input

*******************************************************************************/

T[] cropLeft(T) (T[] s)
{
    // comparing the immutable variants makes the test qualifier-agnostic,
    // so that string literals are handled as well as mutable arrays
    static if (is (immutable(T) == immutable(char)))
    {
        size_t i = 0;
        while (i < s.length && (s[i] & 0xc0) is 0x80)
            ++i;
        return s [i .. $];
    }
    else static if (is (immutable(T) == immutable(wchar)))
    {
        // skip if first char is a trailing surrogate
        if (s.length && (s[0] & 0xfc00) is 0xdc00)
            return s [1 .. $];
        return s;
    }
    else
        return s;
}

/*******************************************************************************

        Adjust the content such that no partial encodings exist on the
        right side of the provided text.

        For utf8 text the last lead byte is located by walking backwards
        over continuation bytes; where the number of bytes from there to
        the end of the text differs from the length announced by the lead
        byte, the text is cut just before that lead byte. For utf16 text,
        a final leading-surrogate is dropped. Other element types, and
        empty input, are returned unchanged.

        The element type may carry any qualifier (char, const(char),
        immutable(char) ...).

        Params:
            s = the text to crop

        Returns:
            a slice of the input

*******************************************************************************/

T[] cropRight(T) (T[] s)
{
    if (s.length is 0)
        return s;

    static if (is (immutable(T) == immutable(char)))
    {
        // scan backwards, index 0 included, for the start of the last
        // sequence
        for (size_t i = s.length; i-- > 0; )
        {
            ubyte b = s[i];

            // plain ASCII: no multi-byte sequence is open at the end
            if ((b & 0x80) is 0)
                break;

            if ((b & 0xc0) is 0xc0)
            {
                // located the first byte of a sequence
                size_t have = s.length - i;

                // 110xxxxx => 2 bytes, 1110xxxx => 3, 1111xxxx => 4.
                // Bit 0x10 is only meaningful once bit 0x20 is set: in a
                // two-byte lead it is part of the payload.
                size_t need = 2;
                if (b & 0x20)
                {
                    ++need;
                    if (b & 0x10)
                        ++need;
                }

                // is the sequence complete?
                return (have is need) ? s : s [0 .. i];
            }

            // continuation byte: keep looking for its lead byte
        }
    }
    else static if (is (immutable(T) == immutable(wchar)))
    {
        // skip if last char is a leading surrogate
        if ((s[$ - 1] & 0xfc00) is 0xd800)
            return s [0 .. $ - 1];
    }

    return s;
}



/*******************************************************************************

        Manual demonstration of cropLeft / cropRight, compiled only with
        -debug=Utf.

*******************************************************************************/

debug (Utf)
{
    import ocean.io.Console;

    void main()
    {
        auto s = "[\xc2\xa2\xc2\xa2\xc2\xa2]";
        Cout (s).newline;

        Cout (cropLeft(s[0..$])).newline;
        Cout (cropLeft(s[1..$])).newline;
        Cout (cropLeft(s[2..$])).newline;
        Cout (cropLeft(s[3..$])).newline;
        Cout (cropLeft(s[4..$])).newline;
        Cout (cropLeft(s[5..$])).newline;

        Cout (cropRight(s[0..$])).newline;
        Cout (cropRight(s[0..$-1])).newline;
        Cout (cropRight(s[0..$-2])).newline;
        Cout (cropRight(s[0..$-3])).newline;
        Cout (cropRight(s[0..$-4])).newline;
        Cout (cropRight(s[0..$-5])).newline;
    }
}