/*******************************************************************************

    Contains utility functions for working with unicode strings. Contains a
    function to return the length of a UTF-8 string, a method to truncate a
    UTF-8 string to the nearest whitespace character that is less than a maximum
    length parameter, and a method to truncate a UTF-8 string and append a set
    ending to it.

    Example usage:

    ---

        char[] utf = ...; // some UTF-8 character sequence

        // using the default unicode error handler
        size_t len1 = utf8Length(utf);

        // using a custom error handler
        // which takes the index of the string as a parameter
        size_t len2 = utf8Length(utf, (size_t i)
        {
            // error handling code...
        });

    ---

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

*******************************************************************************/

module ocean.text.utf.UtfUtil;

import ocean.core.Array: append, copy;
import ocean.core.Test;
import ocean.core.Verify;
import ocean.math.IEEE: isNaN;
import ocean.meta.types.Qualifiers;
import ocean.stdc.gnu.string: memrchr;
import ocean.text.Unicode : isSpace;
import ocean.text.utf.c.glib_unicode;

import core.exception: onUnicodeError;

/*******************************************************************************

    UTF-8 representation of "…".

*******************************************************************************/

public istring ellipsis = "\xE2\x80\xA6";  // The char '…'


/*******************************************************************************

    This array gives the length of a UTF-8 sequence indexed by the value
    of the leading byte. An FF (ubyte.max) represents an illegal starting value
    of a UTF-8 sequence.
    FF is used instead of 0 to avoid having loops hang.

    Layout of the table (16 entries per row):
        0x00 .. 0x7F  single byte sequences
        0x80 .. 0xBF  continuation bytes, illegal as a leading byte
        0xC0 .. 0xDF  two byte sequences
        0xE0 .. 0xEF  three byte sequences
        0xF0 .. 0xFD  four, five and six byte sequences
        0xFE .. 0xFF  illegal

*******************************************************************************/

private static immutable ubyte[char.max + 1] utf8_stride =
[
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,
    ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,
    ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,
    ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,ubyte.max,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,ubyte.max,ubyte.max,
];


/*******************************************************************************

    Calculates the number of UTF8 code points in a UTF8-encoded string.
    Calls the standard unicode error handler on error,
    which throws a new UnicodeException.

    Params:
        str = The string to calculate the length of.

    Returns:
        The length of the given string.

    Throws:
        UnicodeException if an invalid UTF8 code unit is detected.
*******************************************************************************/

public size_t utf8Length ( cstring str )
{
    // Default handler: forward the offending index to the runtime's unicode
    // error hook.
    void error ( size_t i )
    {
        onUnicodeError("invalid UTF-8 sequence", i);
    }

    return utf8Length(str, &error);
}


/*******************************************************************************

    Calculates the number of UTF8 code points in a UTF8-encoded string.
    Calls error_dg if an invalid UTF8 code unit is detected,
    which may throw an exception to abort processing.

    If error_dg returns instead of throwing after an illegal leading byte, the
    index is still advanced by the table value for that byte (ubyte.max). When
    that moves the index past the end of the string, the overshoot check after
    the loop invokes error_dg once more.

    Params:
        str = The string to calculate the length of.
        error_dg = The error delegate to call upon finding an invalid code unit.
            Takes a size_t parameter: the byte index in str at which the
            offending sequence starts.

    Returns:
        The number of leading bytes visited, i.e. the number of code points
        if str is valid UTF-8.

*******************************************************************************/

public size_t utf8Length ( cstring str, scope void delegate ( size_t ) error_dg )
{
    size_t length;
    size_t i;
    size_t stride;

    for ( i = 0; i < str.length; i += stride )
    {
        // check how much we should increment the index
        // based on the size of the current UTF8 code point
        stride = utf8_stride[str[i]];

        if ( stride == ubyte.max )
        {
            error_dg(i);
        }

        length++;
    }

    // The last stride pointed beyond the end of the string: the final
    // sequence is truncated (or its leading byte was illegal). Report the
    // index at which that sequence started.
    if ( i > str.length )
    {
        verify(i >= stride, "i should be stride or greater");
        i -= stride;
        verify(i < str.length, "i - stride should be less than str.length");
        error_dg(i);
    }

    return length;
}

unittest
{
    test(utf8Length(null) == 0,
        "the length of a null string should be 0");

    test(utf8Length("") == 0,
        "the length of an empty string should be 0");

    test(utf8Length("foo bar baz xyzzy") == 17,
        "the length of \"foo bar baz xyzzy\" should be 17");

    test(utf8Length("ðäß ßøø+ ì$ æ ¢ööđ µøvi€ →→→") == 28,
        "the length of \"ðäß ßøø+ ì$ æ ¢ööđ µøvi€ →→→\" should be 28");

    // test if error delegate is called for an invalid string
    bool error_caught = false;
    static immutable istring error_str = "error in " ~ char.init ~ " the middle";
    utf8Length(error_str, ( size_t i ) { error_caught = true; });
    test(error_caught,
        "the call to utf8Length should have caught an error");

    // test that the error delegate is not called for a valid string
    error_caught = false;
    static immutable istring valid_str = "There are no errors in this string!";
    utf8Length(valid_str, ( size_t i ) { error_caught = true; });
    test(!error_caught,
        "the call to utf8Length should not have caught an error");
}


/*******************************************************************************

    Limits str to a length of n UTF-8 code points, cutting off on the last
    space, if found. If str is not valid UTF-8, str.length is assumed to be the
    number of code points.

    The spaces the string is cut at are removed as well, so the result never
    ends in the ' ' characters that triggered the cut. If the only thing in
    front of the cut is spaces, the result is empty.

    Params:
        str = string to limit the length
        n = maximum number of code points in the resulting string

    Out:
        The maximum number of code points in str is n.

    Returns:
        The truncated string for method chaining

*******************************************************************************/

public mstring truncateAtWordBreak ( ref mstring str, size_t n )
out (result)
{
    if (result.length > n)
    {
        assert(g_utf8_validate(result.ptr, result.length, null));
        assert(g_utf8_strlen(result.ptr, result.length) <= n);
    }
}
do
{
    if (n < str.length)
    {
        bool valid_utf8 = g_utf8_validate(str.ptr, str.length, null);

        auto utf8_len = valid_utf8 ? utf8Length(str) : str.length;

        if (n < utf8_len)
        {
            // Byte offset corresponding to n code points (or n bytes, if the
            // string cannot be interpreted as UTF-8).
            size_t last = n;

            if (valid_utf8)
            {
                last = g_utf8_offset_to_pointer(str.ptr, last) - str.ptr;
            }

            char* c = cast(char*) memrchr(str.ptr, ' ', last);

            if (c)
            {
                // Skip consecutive ' ' characters.
                while (*c == ' ' && c > str.ptr)
                {
                    c--;
                }

                // c now points either at the last character to keep or, if
                // everything up to the start of the string is spaces, at the
                // first byte of the string (which is then a ' '). Only keep
                // the byte c points at if it is not a space. Testing
                // `c != str.ptr` here instead would wrongly drop a one-byte
                // word at the very start of the string ("a bcd" -> "").
                str.length = c - str.ptr + (*c != ' ');
            }
            else
            {
                // If no ' ' is found to break on, set the break to the maximum
                // number of code points
                str.length = last;
            }
        }
    }

    return str;
}

unittest
{
    void doTest ( cstring input, cstring expected_output, int length, int line = __LINE__ )
    {
        mstring buffer;
        buffer.copy(input);
        test!("==")(truncateAtWordBreak(buffer, length), expected_output, __FILE__, line);
    }

    doTest("Hello World!", "Hello World!", "Hello World!".length);

    doTest("Hello World!", "Hello World!", "Hello World!".length + 5);

    doTest("Hello World!", "Hello", 9);

    doTest("Hällö World!", "Hällö", 9);

    doTest("äöü", "äöü", 3);

    doTest("Hello World!", "Hello", 9);

    // Several consecutive spaces in front of the cut are all removed.
    doTest("Hello   World!", "Hello", 9);

    // A one-byte word at the very start of the string must be kept.
    doTest("a bcdef", "a", 3);

    // Only spaces in front of the cut: nothing is left.
    doTest(" abcdef", "", 3);
}


/*******************************************************************************

    Truncate the length of a UTF-8 string and append a set ending. The string
    is initially truncated so that it is of maximum length n (this includes
    the extra ending parameter so the string is truncated to position
    n - ending.length).

    Params:
        str = string to truncate and append the ending to
        n = maximum number of code points in the resulting string
        ending = the ending to append to the string, defaults to "..."
295 296 In: 297 n must be at least `ending.length` 298 299 Returns: 300 The truncated and appended string for method chaining 301 302 *******************************************************************************/ 303 304 public mstring truncateAppendEnding ( ref mstring str, size_t n, cstring ending = "...") 305 { 306 verify (n >= ending.length); 307 308 bool valid_utf8 = g_utf8_validate(str.ptr, str.length, null); 309 310 auto utf8_len = valid_utf8 ? utf8Length(str) : str.length; 311 312 if (n < utf8_len) 313 { 314 truncateAtWordBreak(str, (n - ending.length)); 315 str.append(ending); 316 } 317 318 return str; 319 } 320 321 unittest 322 { 323 mstring buffer; 324 325 void doTest ( cstring input, cstring expected_output, int length, 326 cstring ending = "..." , int line = __LINE__ ) 327 { 328 buffer.copy(input); 329 test!("==")(truncateAppendEnding(buffer, length, ending), 330 expected_output, __FILE__, line); 331 } 332 333 doTest("Hello World!", "Hello World!", "Hello World!".length); 334 335 doTest("Hello World!", "Hello World!", "Hello World!".length + 5); 336 337 doTest("Hello World!", "Hello...", 9); 338 339 doTest("Hällö World!", "Hällö...", 9); 340 341 doTest("äöü äöü", "ä...", 4); 342 343 doTest("Hello World!", "Hello...", 9); 344 345 doTest("HelloW" ~ cast (char) 0x81 ~ "rld!", 346 "HelloW" ~ cast (char) 0x81 ~ "...", 10); 347 348 doTest("HelloWörld!", "HelloWörl+", 10, "+"); 349 350 doTest("Designstarker Couchtisch in hochwertiger Holznachbildung. Mit " 351 ~ "praktischem Ablagebogen in Kernnussbaumfarben oder Schwarz. " 352 ~ "Winkelfüße mit Alukante. B", "Designstarker Couchtisch in hochwertiger" 353 ~ " Holznachbildung. Mit praktischem Ablagebogen...", 90); 354 } 355 356 357 /******************************************************************************* 358 359 Limits the length of a UTF-8 string, to at most the specified number of 360 bytes. 
361 362 This is conceptually equal to str[0..max_len], except that we take care to 363 avoid chopping a multi-byte UTF-8 character in half. 364 365 Params: 366 str = the string to be sliced 367 max_len = the maximum allowable length (in bytes) of the string 368 369 Returns: 370 a slice of the original string, of length at most max_len. 371 372 *******************************************************************************/ 373 374 public inout(mstring) limitStringLength ( inout(mstring) str, size_t max_len ) 375 { 376 if ( str.length <= max_len ) 377 { 378 return str; 379 } 380 381 // Make sure we don't chop a character in half. 382 // All UTF-8 continuation bytes are of the form 0b10xxxxxxx, 383 // so we must skip all such bytes 384 385 auto k = max_len; 386 387 while ( k != 0 && ( (str[k] & 0xC0 ) == 0x80) ) 388 { 389 --k; 390 } 391 392 return str[ 0 .. k ]; 393 394 } 395 396 397 unittest 398 { 399 // String ending with a 1-byte character 400 401 test!("==")(limitStringLength("abc", 5), "abc"); 402 test!("==")(limitStringLength("abc", 2), "ab"); 403 404 // String ending with a 2-byte character 405 406 test!("==")(limitStringLength("ÜÄ", 5), "ÜÄ"); 407 test!("==")(limitStringLength("ÜÄ", 4), "ÜÄ"); 408 test!("==")(limitStringLength("ÜÄ", 3), "Ü"); 409 test!("==")(limitStringLength("ÜÄ", 2), "Ü"); 410 test!("==")(limitStringLength("ÜÄ", 1), ""); 411 412 // String ending with a 3-byte character 413 414 test!("==")(limitStringLength("Ü眼", 6), "Ü眼"); 415 test!("==")(limitStringLength("Ü眼", 5), "Ü眼"); 416 test!("==")(limitStringLength("Ü眼", 4), "Ü"); 417 418 // Ensure it compiles with an mstring 419 420 mstring x = "abcd".dup; 421 mstring y = limitStringLength(x, 2); 422 } 423 424 425 /******************************************************************************* 426 427 Truncates a string at the last space before the n-th Unicode character or, 428 if the resulting string is too short, at the n-th Unicode character. 
429 The string should be a valid UTF-8 (the caller should have validated it 430 before calling this function). 431 432 If a string is truncated before the end, then the final Unicode chartacter 433 is made an ending. Trailing space is removed before the ending is added. 434 The returned string will always be no more than n Unicode characters 435 (including the ending). 436 437 The basic algorithm is to walk through src keeping track of how many 438 bytes needed to be sliced at any particular time until we know when 439 we need to end. Because we don't know till the end if we need an 440 ending we need to keep track of one Unicode character behind as well as the 441 position of the Unicode character berore the last space. We have to be 442 careful we never point at spaces. 443 444 Important points when reading the algorithm: 445 446 1) Unicode character != byte 447 2) i == the number of bytes required to include the _previous_ 448 Unicode character (i.e. the number of bytes to the start of c) 449 450 Params: 451 src = the string to truncate (must be UTF-8 encoded) 452 n = the maximum number of Unicode characters allowed in the 453 returned string 454 buffer = a buffer to be used to store the result in (may be 455 resized). The buffer is required because "ending" may 456 contain Unicode characters taking more bytes than the 457 Unicode characters in src they replace, thus leading to a 458 string with fewer Unicode characters but more bytes. 459 ending = These Unicode characters will be appended when "src" needs 460 to be truncated. 
461 fill_ratio = if cutting the string in the last space would make its 462 Unicode character length smaller than "n*fill_ratio", 463 then we cut it on the n-th Unicode character 464 465 Returns: 466 buffer 467 468 *******************************************************************************/ 469 470 public mstring truncateAtN(cstring src, size_t n, ref mstring buffer, 471 cstring ending = ellipsis, float fill_ratio = 0.75) 472 out (result) 473 { 474 size_t result_length = 0; 475 foreach ( dchar c; result ) 476 { 477 ++result_length; 478 } 479 480 assert(result_length <= n); 481 } 482 do 483 { 484 { 485 size_t ending_length = 0; // Ending's number of Unicode characters 486 foreach ( dchar c; ending ) 487 { 488 ++ending_length; 489 } 490 491 verify(n > ending_length); 492 493 verify(!isNaN(fill_ratio)); 494 verify(fill_ratio>=0 && fill_ratio<=1); 495 } 496 497 size_t ending_length = 0; // Ending's number of Unicode characters 498 foreach ( size_t i, dchar c; ending ) 499 { 500 ++ending_length; 501 } 502 503 size_t net_length = n - ending_length; // The maximum number of Unicode 504 // characters that can be kept, if 505 // ending is used. 506 507 size_t code_point_count; // Which Unicode character are we up to. 508 size_t bytes_needed = 0; // Number of bytes needed to include the last 509 // valid looking Unicode character. 510 size_t last_space_bytes_net = 0; // Number of bytes needed to include the 511 // last valid Unicode character which is 512 // before the last known space, if ending 513 // is used. 514 size_t last_space_code_points_net = 0; // The number of Unicode characters 515 // that precede the last space, if ending 516 // is used. 517 size_t last_space_bytes_n = 0; // Number of bytes needed to include the 518 // last valid Unicode character which is 519 // before the last known space, if ending 520 // is not used. 
521 size_t last_space_code_points_n = 0; // The number of Unicode characters 522 // that precede the last space, if ending 523 // is not used. 524 bool need_ending; // Do we know we need an ending already? 525 bool last_was_space; // Was the previous character a space? 526 527 foreach ( size_t i, dchar c; src ) 528 { 529 bool curr_is_space = isSpace(c); 530 531 // Keep Unicode characters that will be returned if the ending is used. 532 if ( code_point_count <= net_length ) 533 { 534 // We still need more Unicode characters so we update the counters. 535 // In the edge case (code_point_count == net_length), the 536 // current Unicode character is not needed. However, we need its "i" 537 // in order to find the bytes of the string which includes the 538 // previous Unicode character. 539 if ( ! last_was_space ) 540 { 541 bytes_needed = i; 542 543 if ( curr_is_space ) 544 { 545 // If the current Unicode character is a space, the previous 546 // is not a space and we are not at the end, keep its 547 // position. 548 last_space_bytes_net = i; 549 last_space_code_points_net = code_point_count; 550 } 551 } 552 } 553 554 // Keep Unicode characters that will be returned if the ending is not 555 // used. 556 if ( code_point_count <= n 557 && ! last_was_space 558 && curr_is_space ) 559 { 560 // Use "n" instead of "net_length". 561 last_space_bytes_n = i; 562 last_space_code_points_n = code_point_count; 563 } 564 565 last_was_space = curr_is_space; 566 567 // This Unicode character will be truncated, but we need to check if it 568 // is a space character. If the Unicode characters that we ommit are 569 // spaces, we will not append the ending, we will just remove the spaces. 570 if ( code_point_count >= n ) 571 { 572 if ( ! curr_is_space ) 573 { 574 // This is a non-space Unicode character so we are truncating. 
575 need_ending = true; 576 break; 577 } 578 } 579 580 // Track which Unicode character we are up to (as opposed to byte) 581 ++code_point_count; 582 } 583 584 // We may have fallen off the end of src before we had time to set up all 585 // our variables. If need_ending is true though we know that isn't the case. 586 if ( need_ending ) 587 { 588 // Check if there is a long enough string before the last space. 589 if ( last_space_bytes_net 590 && (last_space_code_points_net / (cast(float)n) > fill_ratio) ) 591 { 592 bytes_needed = last_space_bytes_net; 593 } 594 // Copy up to the prev positon, which may be the 2nd last Unicode 595 // character or the Unicode character before the last space. 596 assumeSafeAppend(buffer); 597 buffer.length = bytes_needed + ending.length; 598 assumeSafeAppend(buffer); 599 buffer[0 .. bytes_needed] = src[0 .. bytes_needed]; 600 // And append an ending 601 buffer[bytes_needed .. bytes_needed + ending.length] = ending[]; 602 } 603 else 604 { 605 // We need to check if we finished one or more iterations short 606 if ( code_point_count <= n ) 607 { 608 // We did so src is short and if there is no trailing space 609 // we can just use it as is. If there was trailing space then 610 // "last_space_bytes" will have already been set correctly on the 611 // iteration caused by the space 612 if ( ! last_was_space ) 613 { 614 last_space_bytes_n = src.length; 615 } 616 } 617 // No need to append the ending so use the full string we found 618 assumeSafeAppend(buffer); 619 buffer.length = last_space_bytes_n; 620 assumeSafeAppend(buffer); 621 buffer[] = src[0 .. 
last_space_bytes_n]; 622 } 623 return(buffer); 624 } 625 626 unittest 627 { 628 auto t = new NamedTest( 629 "truncateAtN" 630 ); 631 632 mstring buffer; 633 634 // Old test 635 foreach (i, char c; "…") 636 { 637 t.test!("==")(ellipsis[i], c); 638 } 639 640 istring str = "Hello World!"; 641 t.test!("==")(str.truncateAtN(str.length, buffer), "Hello World!"); 642 t.test!("==")(str.truncateAtN(str.length + 5, buffer), "Hello World!"); 643 t.test!("==")(str.truncateAtN(10, buffer), "Hello Wor" ~ ellipsis); 644 645 t.test!("==")("Hällö World!"c.truncateAtN(10, buffer), 646 "Hällö Wor"c ~ ellipsis); 647 t.test!("==")("äöü"c.truncateAtN(3, buffer), "äöü"c); 648 t.test!("==")("Hello World!".dup.truncateAtN(10, buffer), 649 "Hello Wo" ~ ellipsis); 650 t.test!("==")("HelloWörld!"c.truncateAtN(10, buffer, "+"), "HelloWörl+"c); 651 t.test!("==")( 652 "Designstarker Couchtisch in hochwertiger Holznachbildung. Mit praktischem Ablagebogen in Kernnussbaumfarben oder Schwarz. Winkelfüße mit Alukante. B"c.truncateAtN(100, buffer), 653 "Designstarker Couchtisch in hochwertiger Holznachbildung. 
Mit praktischem Ablagebogen in"c ~ ellipsis 654 ); 655 656 // Andrew's tests 657 658 t.test!("==")(("This should be the longest string of all the unit tests.\n" 659 ~ "We do this so that the buffer never needs expanding again.\n" 660 ~ "This way we can check for unnecessary allocations.") 661 .truncateAtN(160, buffer), 662 "This should be the longest string of all the unit tests.\n" 663 ~ "We do this so that the buffer never needs expanding again.\n" 664 ~ "This way we can check for unnecessary…" 665 ); 666 667 typeof(buffer.ptr) orig_ptr = buffer.ptr; 668 669 t.test!("==")(" ".truncateAtN(2, buffer), ""); 670 t.test!("==")("12 ".truncateAtN(4, buffer), "12"); 671 t.test!("==")("12 ".truncateAtN(6, buffer), "12"); 672 t.test!("==")("hello".truncateAtN(2, buffer), "h…"); 673 t.test!("==")("hello".truncateAtN(4, buffer), "hel…"); 674 t.test!("==")("hello".truncateAtN(5, buffer), "hello"); 675 t.test!("==")("hello".truncateAtN(6, buffer), "hello"); 676 t.test!("==")("hello".truncateAtN(10, buffer), "hello"); 677 t.test!("==")("h l o".truncateAtN(5, buffer), "h l o"); 678 t.test!("==")("hello ".truncateAtN(5, buffer), "hello"); 679 t.test!("==")("hello ".truncateAtN(6, buffer), "hello"); 680 t.test!("==")("hello ".truncateAtN(7, buffer), "hello"); 681 t.test!("==")("hello ".truncateAtN(10, buffer), "hello"); 682 t.test!("==")("hello world".truncateAtN(8, buffer), "hello…"); 683 t.test!("==")("hello | world".truncateAtN(7, buffer), "hello…"); 684 t.test!("==")("hello | world".truncateAtN(8, buffer), "hello |…"); 685 t.test!("==")("hello | world".truncateAtN(32, buffer), "hello | world"); 686 t.test!("==")("h llo world".truncateAtN(3, buffer), "h…"); 687 t.test!("==")("he ll o world".truncateAtN(9, buffer), "he ll…"); 688 t.test!("==")("he ll o world".truncateAtN(10, buffer), "he ll o…"); 689 t.test!("==")("he ll o world".truncateAtN(32, buffer), 690 "he ll o world"); 691 692 t.test!("==")("a".truncateAtN(4, buffer), "a"); 693 t.test!("==")("ab".truncateAtN(4, buffer), 
"ab"); 694 t.test!("==")("a|".truncateAtN(4, buffer), "a|"); 695 t.test!("==")("ab|".truncateAtN(4, buffer), "ab|"); 696 t.test!("==")("ab|d".truncateAtN(4, buffer), "ab|d"); 697 t.test!("==")("abc|".truncateAtN(4, buffer), "abc|"); 698 t.test!("==")("abcd| ".truncateAtN(4, buffer), "abc…"); 699 t.test!("==")("a| d".truncateAtN(4, buffer), "a| d"); 700 701 t.test!("==")("По оживлённым берегам"c.truncateAtN(2, buffer), "П…"c); 702 t.test!("==")("По оживлённым берегам"c.truncateAtN(3, buffer), "По…"c); 703 t.test!("==")("По оживлённым берегам"c.truncateAtN(4, buffer), "По…"c); 704 t.test!("==")("По оживлённым берегам"c.truncateAtN(5, buffer), "По о…"c); 705 t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(2, buffer), 706 "Ἰ…"c); 707 t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(3, buffer), 708 "Ἰο…"c); 709 t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(4, buffer), 710 "Ἰοὺ…"c); 711 t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(5, buffer), 712 "Ἰοὺ…"c); 713 t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(6, buffer), 714 "Ἰοὺ ἰ…"c); 715 t.test!("==")("Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία"c.truncateAtN(256, buffer), 716 "Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία"c); 717 t.test!("==")("पशुपतिरपि तान्यहानि कृच्छ्राद्"c.truncateAtN(6,buffer), "पशुपत…"c); // NB शु is 2 chars 718 t.test!("==")("पशुपतिरपि तान्यहानि कृच्छ्राद्"c.truncateAtN(8, buffer), "पशुपतिर…"c); 719 t.test!("==")("子曰:「學而時習之,不亦說乎?有朋自遠方來,不亦樂乎?"c.truncateAtN(5, buffer), "子曰:「…"c); 720 721 // we don't yet support R-To-L languages so don't test Arabic 722 //test(truncate_at_n("بِسْمِ ٱللّٰهِ ٱلرَّحْمـَبنِ ٱلرَّحِيمِ", 5c, buffer) = "…رَّحِيمِ"c); 723 724 // Use some other ending that is not one character. 
725 t.test!("==")("a| d".truncateAtN(4, buffer, "..."), "a| d"); 726 t.test!("==")("a| d1".truncateAtN(4, buffer, "..."), "a..."); 727 t.test!("==")("1234567890".truncateAtN(7, buffer, "..."), "1234..."); 728 t.test!("==")("1234567890".truncateAtN(70, buffer, "..."), "1234567890"); 729 t.test!("==")("1234 6789 1234 6789 1234 6789".truncateAtN(25, buffer, "..."), 730 "1234 6789 1234 6789..."); 731 732 // check nothing has allocated 733 t.test!("==")(orig_ptr, buffer.ptr); 734 }