ocean.text.utf.UtfUtil source code

1 /*******************************************************************************
2 
3     Contains utility functions for working with unicode strings. Contains a
4     function to return the length of a UTF-8 string, a method to truncate a
5     UTF-8 string to the nearest whitespace character that is less than a maximum
6     length parameter, and a method to truncate a UTF-8 string and append a set
7     ending to it.
8 
9     Example usage:
10 
11     ---
12 
13         char[] utf = ...; // some UTF-8 character sequence
14 
15         // using the default unicode error handler
16         size_t len1 = utf8Length(utf);
17 
18         // using a custom error handler
19         // which takes the index of the string as a parameter
20         size_t len2 = utf8Length(utf, (size_t i){ // error handling code...  });
21 
22     ---
23 
24     Copyright:
25         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
26         All rights reserved.
27 
28     License:
29         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
30         Alternatively, this file may be distributed under the terms of the Tango
31         3-Clause BSD License (see LICENSE_BSD.txt for details).
32 
33 *******************************************************************************/
34 
35 module ocean.text.utf.UtfUtil;
36 
37 import core.exception: onUnicodeError;
38 
39 import ocean.transition;
40 
41 import ocean.stdc.string: memrchr;
42 
43 import ocean.core.Array: append, copy;
44 import ocean.core.Verify;
45 
46 import ocean.math.IEEE: isNaN;
47 
48 import ocean.text.Unicode : isSpace;
49 
50 import ocean.text.utf.c.glib_unicode;
51 
52 import ocean.core.Test;
53 
54 
/*******************************************************************************

    The horizontal ellipsis character '…' (U+2026) encoded as UTF-8, i.e. the
    three bytes E2 80 A6. Used as the default ending of truncateAtN().

*******************************************************************************/

public istring ellipsis = "\u2026";  // same bytes as "\xE2\x80\xA6"
62 
63 
/*******************************************************************************

    Lookup table mapping the first byte of a UTF-8 sequence to the number of
    bytes in that sequence. Bytes which cannot start a sequence (continuation
    bytes 0x80 .. 0xBF and the values 0xFE / 0xFF) map to ubyte.max.
    ubyte.max is used as the "illegal" marker rather than 0 so that a loop
    which advances by the stride can never get stuck on the same byte.

    The table is computed at compile time; its content is:

        0x00 .. 0x7F -> 1
        0x80 .. 0xBF -> ubyte.max
        0xC0 .. 0xDF -> 2
        0xE0 .. 0xEF -> 3
        0xF0 .. 0xF7 -> 4
        0xF8 .. 0xFB -> 5
        0xFC .. 0xFD -> 6
        0xFE .. 0xFF -> ubyte.max

*******************************************************************************/

private static immutable ubyte[char.max + 1] utf8_stride = () {
    // Start with every entry marked as illegal, then fill in the valid
    // lead-byte ranges.
    ubyte[char.max + 1] table = ubyte.max;

    table[0x00 .. 0x80] = 1;
    table[0xC0 .. 0xE0] = 2;
    table[0xE0 .. 0xF0] = 3;
    table[0xF0 .. 0xF8] = 4;
    table[0xF8 .. 0xFC] = 5;
    table[0xFC .. 0xFE] = 6;

    return table;
}();
92 
93 
/*******************************************************************************

    Counts the UTF-8 sequences (code points) in a UTF-8 encoded string.

    This is a convenience overload of the delegate-taking utf8Length() which
    reports problems through the runtime's standard unicode error handler,
    onUnicodeError(), passing it the offending index.

    Params:
        str = The string to calculate the length of.

    Returns:
        The number of code points in str.

    Throws:
        UnicodeException if an invalid UTF8 code unit is detected.

*******************************************************************************/

public size_t utf8Length ( cstring str )
{
    // Forward to the general overload with a delegate that raises the
    // standard unicode error.
    return utf8Length(str,
        ( size_t i )
        {
            onUnicodeError("invalid UTF-8 sequence", i);
        }
    );
}
120 
121 
/*******************************************************************************

    Calculates the number of UTF8 code points in a UTF8-encoded string.
    Calls error_dg if an invalid UTF8 code unit is detected,
    which may throw an exception to abort processing.

    Only the leading byte of each sequence is inspected (via utf8_stride);
    continuation bytes are skipped without being checked.

    If error_dg returns instead of throwing, the offending byte is counted as
    a single unit and scanning resumes at the very next byte, so every invalid
    leading byte is reported exactly once and no following data is skipped.

    Params:
        str = The string to calculate the length of.
        error_dg = The error delegate to call upon finding an invalid code unit.
            Takes a size_t parameter: the byte index in str of the invalid
            leading byte, or of the leading byte of a sequence that runs past
            the end of str.

    Returns:
        The length of the given string.

*******************************************************************************/

public size_t utf8Length ( cstring str, scope void delegate ( size_t ) error_dg )
{
    size_t length;
    size_t i;
    size_t stride;

    for ( i = 0; i < str.length; i += stride )
    {
        // check how much we should increment the index
        // based on the size of the current UTF8 code point
        stride = utf8_stride[str[i]];

        if ( stride == ubyte.max )
        {
            error_dg(i);

            // The delegate chose not to abort: resynchronise on the next
            // byte. Advancing by the sentinel value (255) would silently
            // skip valid data and report the same error a second time below.
            stride = 1;
        }

        length++;
    }

    // Overshooting the end means the last leading byte announced more bytes
    // than the string contains, i.e. the final sequence is truncated.
    if ( i > str.length )
    {
        verify(i >= stride, "i should be stride or greater");
        i -= stride;
        verify(i < str.length, "i - stride should be less than str.length");
        error_dg(i);
    }

    return length;
}
169 
unittest
{
    // Plain length checks: null, empty, ASCII-only and multi-byte input.
    test(utf8Length(null) == 0,
        "the length of a null string should be 0");

    test(utf8Length("") == 0,
        "the length of an empty string should be 0");

    test(utf8Length("foo bar baz xyzzy") == 17,
        "the length of \"foo bar baz xyzzy\" should be 17");

    test(utf8Length("ðäß ßøø+ ì$ æ ¢ööđ µøvi€ →→→") == 28,
        "the length of \"ðäß ßøø+ ì$ æ ¢ööđ µøvi€ →→→\" should be 28");

    // Non-throwing error handler which only records that it has been invoked.
    bool handler_invoked = false;

    void recordError ( size_t i )
    {
        handler_invoked = true;
    }

    // The error delegate must be called for an invalid string
    // (char.init is 0xFF, which can never start a UTF-8 sequence).
    static immutable istring error_str = "error in " ~ char.init ~ " the middle";
    utf8Length(error_str, &recordError);
    test(handler_invoked,
        "the call to utf8Length should have caught an error");

    // The error delegate must NOT be called for a valid string.
    handler_invoked = false;
    static immutable istring valid_str = "There are no errors in this string!";
    utf8Length(valid_str, &recordError);
    test(!handler_invoked,
        "the call to utf8Length should not have caught an error");
}
198 
199 
/*******************************************************************************

    Limits str to a length of n UTF-8 code points, cutting off on the last
    space, if found. If str is not valid UTF-8, str.length is assumed to be the
    number of code points (i.e. n is then treated as a number of bytes).

    When a space (' ') is found within the first n code points, the string is
    cut after the last non-space character preceding the last such space, so
    the result never ends with the spaces it was broken on. If everything in
    front of the break is spaces, the result is empty. If there is no space to
    break on, the string is cut after exactly n code points.

    str is modified in place (its length is reduced).

    Params:
        str = string to limit the length
        n = maximum number of code points in the resulting string

    Out:
        The maximum number of code points in str is n.

    Returns:
        The truncated string for method chaining

*******************************************************************************/

public mstring truncateAtWordBreak ( ref mstring str, size_t n )
out (result)
{
    // A result of at most n bytes trivially has at most n code points; only
    // longer results (multi-byte characters) need the expensive check.
    if (result.length > n)
    {
        assert(g_utf8_validate(result.ptr, result.length, null));
        assert(g_utf8_strlen(result.ptr, result.length) <= n);
    }
}
body
{
    // A string of at most n bytes cannot contain more than n code points.
    if (n < str.length)
    {
        bool valid_utf8 = g_utf8_validate(str.ptr, str.length, null);

        auto utf8_len = valid_utf8 ? utf8Length(str) : str.length;

        if (n < utf8_len)
        {
            // Byte offset of the cut: n bytes for invalid input, otherwise the
            // offset of the code point with index n.
            size_t last = n;

            if (valid_utf8)
            {
                last = g_utf8_offset_to_pointer(str.ptr, last) - str.ptr;
            }

            // Last ' ' within the first `last` bytes, if any.
            void* result = memrchr(str.ptr, ' ', last);
            char* c = cast(char*) result;
            if (c)
            {
                // Skip consecutive ' ' characters.
                while (*c == ' ' && c > str.ptr)
                {
                    c--;
                }

                // c now points either at the last non-space character in
                // front of the break, which must be kept (+1), or at a space
                // at the very start of the string, in which case nothing is
                // kept. Testing the character rather than the pointer keeps a
                // one-character first word ("a bcd" -> "a", not "").
                str.length = c - str.ptr + (*c != ' ');
            }
            else
            {
                // If no ' ' is found to break on, set the break to the maximum
                // number of code points
                str.length = last;
            }
        }
    }

    return str;
}
267 
unittest
{
    // Copies `input` into a fresh mutable buffer, truncates it to `length`
    // code points and compares the outcome with `expected_output`.
    void doTest ( cstring input, cstring expected_output, int length, int line = __LINE__ )
    {
        mstring scratch;
        scratch.copy(input);

        auto truncated = truncateAtWordBreak(scratch, length);
        test!("==")(truncated, expected_output, __FILE__, line);
    }

    // Limit equal to or above the string length: nothing is cut.
    doTest("Hello World!", "Hello World!", "Hello World!".length);

    doTest("Hello World!", "Hello World!", "Hello World!".length + 5);

    // Cut at the last space in front of the limit.
    doTest("Hello World!", "Hello", 9);

    // Same with multi-byte characters: the limit counts code points.
    doTest("Hällö World!", "Hällö", 9);

    // More bytes than the limit but not more code points: nothing is cut.
    doTest("äöü", "äöü", 3);

    // Consecutive spaces in front of the break are all removed.
    doTest("Hello  World!", "Hello", 9);
}
289 
290 
/*******************************************************************************

    Shortens a UTF-8 string to at most n code points and appends an ending to
    signal the truncation. Strings which already fit into n code points are
    left untouched and get no ending.

    To make room for the ending, the string is first truncated with
    truncateAtWordBreak() to n - ending.length code points (ending.length
    being the length of the ending parameter in bytes), then the ending is
    appended.

    If str is not valid UTF-8, its byte length is used as the number of code
    points.

    Params:
        str = string to truncate and append the ending to
        n = maximum number of code points in the resulting string
        ending = the ending to append to the string, defaults to "..."

    In:
        n must be at least `ending.length`

    Returns:
        The truncated and appended string for method chaining

*******************************************************************************/

public mstring truncateAppendEnding ( ref mstring str, size_t n, cstring ending = "...")
{
    verify (n >= ending.length);

    // Number of code points in str; the byte length stands in for it when the
    // string is not valid UTF-8.
    size_t code_points = str.length;

    if (g_utf8_validate(str.ptr, str.length, null))
    {
        code_points = utf8Length(str);
    }

    // Short enough already: hand the string back unchanged.
    if (code_points <= n)
    {
        return str;
    }

    truncateAtWordBreak(str, n - ending.length);
    str.append(ending);

    return str;
}
327 
unittest
{
    // Buffer shared by all checks below.
    mstring scratch;

    // Copies `input` into the shared buffer, truncates it to `length` code
    // points (ending included) and compares with `expected_output`.
    void doTest ( cstring input, cstring expected_output, int length,
        cstring ending = "..." , int line = __LINE__ )
    {
        scratch.copy(input);

        auto truncated = truncateAppendEnding(scratch, length, ending);
        test!("==")(truncated, expected_output, __FILE__, line);
    }

    // Strings which fit are returned unchanged, without ending.
    doTest("Hello World!", "Hello World!", "Hello World!".length);

    doTest("Hello World!", "Hello World!", "Hello World!".length + 5);

    // Truncation at a word break, ASCII and multi-byte.
    doTest("Hello World!", "Hello...", 9);

    doTest("Hällö World!", "Hällö...", 9);

    // No space within the allowed part: cut at the code point limit.
    doTest("äöü äöü", "ä...", 4);

    // Consecutive spaces are removed in front of the ending.
    doTest("Hello  World!", "Hello...", 9);

    // Invalid UTF-8 (lone continuation byte): lengths are counted in bytes.
    doTest("HelloW"  ~ cast (char) 0x81 ~ "rld!",
        "HelloW"  ~ cast (char) 0x81 ~ "...", 10);

    // Custom single-character ending.
    doTest("HelloWörld!", "HelloWörl+", 10, "+");

    // Long real-world text.
    doTest("Designstarker Couchtisch in hochwertiger Holznachbildung. Mit "
      ~ "praktischem Ablagebogen in Kernnussbaumfarben oder Schwarz. "
      ~ "Winkelfüße mit Alukante. B", "Designstarker Couchtisch in hochwertiger"
      ~ " Holznachbildung. Mit praktischem Ablagebogen...", 90);
}
362 
363 
/*******************************************************************************

    Returns the longest prefix of a UTF-8 string which is at most max_len
    bytes long and does not end in the middle of a multi-byte character.

    The result is the same as str[0..max_len] whenever that slice ends on a
    character boundary; otherwise the incomplete trailing character is dropped
    entirely.

    Params:
        str     = the string to be sliced
        max_len = the maximum allowable length (in bytes) of the string

    Returns:
        a slice of the original string, of length at most max_len.

*******************************************************************************/

public Inout!(mstring) limitStringLength ( Inout!(mstring) str, size_t max_len )
{
    // Nothing to do if the whole string fits.
    if ( max_len >= str.length )
    {
        return str;
    }

    // str[cut] is the first byte which would be removed. If it is a
    // continuation byte (all of which have the form 0b10xxxxxx) the cut would
    // split a character, so move it backwards until it sits on the start of
    // a character or on the start of the string.
    size_t cut = max_len;

    for ( ; cut != 0; --cut )
    {
        if ( (str[cut] & 0xC0) != 0x80 )
        {
            break;
        }
    }

    return str[ 0 .. cut ];
}
402 
403 
unittest
{
    // Only 1-byte characters: behaves like plain slicing

    test!("==")(limitStringLength("abc", 5), "abc");
    test!("==")(limitStringLength("abc", 2), "ab");

    // 2-byte characters: a limit falling inside a character drops it

    test!("==")(limitStringLength("ÜÄ", 5), "ÜÄ");
    test!("==")(limitStringLength("ÜÄ", 4), "ÜÄ");
    test!("==")(limitStringLength("ÜÄ", 3), "Ü");
    test!("==")(limitStringLength("ÜÄ", 2), "Ü");
    test!("==")(limitStringLength("ÜÄ", 1), "");

    // A 2-byte character followed by a 3-byte character

    test!("==")(limitStringLength("Ü眼", 6), "Ü眼");
    test!("==")(limitStringLength("Ü眼", 5), "Ü眼");
    test!("==")(limitStringLength("Ü眼", 4), "Ü");

    // The function must also accept (and return) a mutable string

    mstring mutable_input = "abcd".dup;
    mstring mutable_slice = limitStringLength(mutable_input, 2);
}
430 
431 
/*******************************************************************************

    Truncates a string at the last space before the n-th Unicode character or,
    if the resulting string is too short, at the n-th Unicode character.
    The string should be valid UTF-8 (the caller should have validated it
    before calling this function).

    If the string is truncated before its end, the ending is appended in place
    of the removed text. Trailing space is removed before the ending is added.
    The returned string never has more than n Unicode characters (including
    the ending).

    The algorithm walks through src once, keeping track of how many bytes
    would have to be kept if the walk stopped at the current position. As it
    is not known until the end whether an ending is required, two sets of
    positions are tracked: one which leaves room for the ending and one which
    uses all n characters. The tracked positions never include trailing
    spaces.

    Important points when reading the algorithm:

    1) Unicode character != byte
    2) the byte offset of the current Unicode character is the number of bytes
       required to include the _previous_ Unicode character

    Params:
        src        = the string to truncate (must be UTF-8 encoded)
        n          = the maximum number of Unicode characters allowed in the
                     returned string
        buffer     = a buffer to be used to store the result in (may be
                     resized). The buffer is required because "ending" may
                     contain Unicode characters taking more bytes than the
                     Unicode characters in src they replace, thus leading to a
                     string with fewer Unicode characters but more bytes.
        ending     = These Unicode characters will be appended when "src" needs
                     to be truncated.
        fill_ratio = if cutting the string at the last space would leave a
                     number of Unicode characters which is not greater than
                     "n*fill_ratio", then it is cut at the n-th Unicode
                     character instead

    Returns:
        buffer

*******************************************************************************/

public mstring truncateAtN(cstring src, size_t n, ref mstring buffer,
    cstring ending = ellipsis, float fill_ratio = 0.75)
out (result)
{
    // The result must not contain more than n Unicode characters.
    size_t result_length = 0;
    foreach ( dchar c; result )
    {
        ++result_length;
    }

    assert(result_length <= n);
}
body
{
    // Number of Unicode characters (not bytes) in the ending.
    size_t ending_length = 0;
    foreach ( dchar c; ending )
    {
        ++ending_length;
    }

    // Argument sanity checks.
    verify(n > ending_length);

    verify(!isNaN(fill_ratio));
    verify(fill_ratio>=0 && fill_ratio<=1);

    // The maximum number of Unicode characters of src that can be kept if the
    // ending is used.
    size_t net_length = n - ending_length;

    size_t chars_seen;          // Index of the current Unicode character.
    size_t keep_bytes = 0;      // Bytes to keep if the ending is used and the
                                // cut is made at the character limit.
    size_t break_bytes_net = 0; // Byte offset of the last space which follows
                                // a non-space, within the room left by the
                                // ending.
    size_t break_chars_net = 0; // Number of Unicode characters in front of
                                // that space.
    size_t break_bytes_n = 0;   // Same as break_bytes_net, but for the case
                                // that no ending is used (limit n).
    bool truncated;             // Is a non-space character being cut off?
    bool prev_is_space;         // Was the previous character a space?

    foreach ( size_t offset, dchar ch; src )
    {
        bool is_space = isSpace(ch);

        // Positions are only recorded directly behind a non-space character,
        // so that they never include trailing spaces.
        if ( ! prev_is_space )
        {
            // Candidates for the case that the ending is used. In the edge
            // case (chars_seen == net_length) the current character itself is
            // not kept, but its offset is the number of bytes which includes
            // the previous one.
            if ( chars_seen <= net_length )
            {
                keep_bytes = offset;

                if ( is_space )
                {
                    break_bytes_net = offset;
                    break_chars_net = chars_seen;
                }
            }

            // Candidate for the case that the ending is not used.
            if ( is_space && chars_seen <= n )
            {
                break_bytes_n = offset;
            }
        }

        prev_is_space = is_space;

        // From the n-th character on everything is cut off. If only spaces
        // are cut off, no ending is needed; the first non-space character
        // decides that the string really is truncated.
        if ( chars_seen >= n && ! is_space )
        {
            truncated = true;
            break;
        }

        // Track which Unicode character we are up to (as opposed to byte)
        ++chars_seen;
    }

    size_t head_bytes;  // Number of bytes of src to copy to the buffer.
    cstring tail;       // What to append to them (empty if not truncated).

    if ( truncated )
    {
        // Prefer the last word break, provided that the string in front of it
        // is long enough.
        if ( break_bytes_net
            && (break_chars_net / (cast(float)n) > fill_ratio) )
        {
            keep_bytes = break_bytes_net;
        }

        head_bytes = keep_bytes;
        tail = ending;
    }
    else
    {
        // The end of src was reached. If src is short enough and does not end
        // in spaces it is used as a whole; otherwise the position recorded at
        // the beginning of the trailing spaces is used.
        if ( chars_seen <= n && ! prev_is_space )
        {
            break_bytes_n = src.length;
        }

        head_bytes = break_bytes_n;
    }

    // Resize the buffer, allowing it to be reused in place, and fill it.
    enableStomping(buffer);
    buffer.length = head_bytes + tail.length;
    enableStomping(buffer);

    buffer[0 .. head_bytes] = src[0 .. head_bytes];
    buffer[head_bytes .. head_bytes + tail.length] = tail[];

    return(buffer);
}
632 
unittest
{
    auto nt = new NamedTest("truncateAtN");

    mstring buffer;

    // The ellipsis constant must consist of the bytes of the '…' character
    foreach (i, char c; "…")
    {
        nt.test!("==")(ellipsis[i], c);
    }

    // Basic checks

    istring hello = "Hello World!";
    nt.test!("==")(hello.truncateAtN(hello.length, buffer), "Hello World!");
    nt.test!("==")(hello.truncateAtN(hello.length + 5, buffer), "Hello World!");
    nt.test!("==")(hello.truncateAtN(10, buffer), "Hello Wor" ~ ellipsis);

    nt.test!("==")("Hällö World!"c.truncateAtN(10, buffer),
        "Hällö Wor"c ~ ellipsis);
    nt.test!("==")("äöü"c.truncateAtN(3, buffer), "äöü"c);
    nt.test!("==")("Hello  World!".dup.truncateAtN(10, buffer),
        "Hello  Wo" ~ ellipsis);
    nt.test!("==")("HelloWörld!"c.truncateAtN(10, buffer, "+"), "HelloWörl+"c);
    nt.test!("==")(
        "Designstarker Couchtisch in hochwertiger Holznachbildung. Mit praktischem Ablagebogen in Kernnussbaumfarben oder Schwarz. Winkelfüße mit Alukante. B"c.truncateAtN(100, buffer),
        "Designstarker Couchtisch in hochwertiger Holznachbildung. Mit praktischem Ablagebogen in"c ~ ellipsis
    );

    // The longest input comes first so that the buffer reaches its final
    // size here; the checks after it must then run without reallocating.

    nt.test!("==")(("This should be the longest string of all the unit tests.\n"
      ~ "We do this so that the buffer never needs expanding again.\n"
      ~ "This way we can check for unnecessary allocations.")
        .truncateAtN(160, buffer),
        "This should be the longest string of all the unit tests.\n"
      ~ "We do this so that the buffer never needs expanding again.\n"
      ~ "This way we can check for unnecessary…"
    );

    typeof(buffer.ptr) initial_ptr = buffer.ptr;

    // Spaces, word breaks and limits around the string length

    nt.test!("==")("     ".truncateAtN(2, buffer), "");
    nt.test!("==")("12   ".truncateAtN(4, buffer), "12");
    nt.test!("==")("12   ".truncateAtN(6, buffer), "12");
    nt.test!("==")("hello".truncateAtN(2, buffer), "h…");
    nt.test!("==")("hello".truncateAtN(4, buffer), "hel…");
    nt.test!("==")("hello".truncateAtN(5, buffer), "hello");
    nt.test!("==")("hello".truncateAtN(6, buffer), "hello");
    nt.test!("==")("hello".truncateAtN(10, buffer), "hello");
    nt.test!("==")("h l o".truncateAtN(5, buffer), "h l o");
    nt.test!("==")("hello ".truncateAtN(5, buffer), "hello");
    nt.test!("==")("hello ".truncateAtN(6, buffer), "hello");
    nt.test!("==")("hello ".truncateAtN(7, buffer), "hello");
    nt.test!("==")("hello ".truncateAtN(10, buffer), "hello");
    nt.test!("==")("hello   world".truncateAtN(8, buffer), "hello…");
    nt.test!("==")("hello | world".truncateAtN(7, buffer), "hello…");
    nt.test!("==")("hello | world".truncateAtN(8, buffer), "hello |…");
    nt.test!("==")("hello | world".truncateAtN(32, buffer), "hello | world");
    nt.test!("==")("h llo world".truncateAtN(3, buffer), "h…");
    nt.test!("==")("he  ll  o  world".truncateAtN(9, buffer), "he  ll…");
    nt.test!("==")("he  ll  o  world".truncateAtN(10, buffer), "he  ll  o…");
    nt.test!("==")("he  ll  o  world".truncateAtN(32, buffer),
        "he  ll  o  world");

    // Strings about as long as the limit

    nt.test!("==")("a".truncateAtN(4, buffer), "a");
    nt.test!("==")("ab".truncateAtN(4, buffer), "ab");
    nt.test!("==")("a|".truncateAtN(4, buffer), "a|");
    nt.test!("==")("ab|".truncateAtN(4, buffer), "ab|");
    nt.test!("==")("ab|d".truncateAtN(4, buffer), "ab|d");
    nt.test!("==")("abc|".truncateAtN(4, buffer), "abc|");
    nt.test!("==")("abcd| ".truncateAtN(4, buffer), "abc…");
    nt.test!("==")("a| d".truncateAtN(4, buffer), "a| d");

    // Non-Latin scripts with multi-byte characters

    nt.test!("==")("По оживлённым берегам"c.truncateAtN(2, buffer), "П…"c);
    nt.test!("==")("По оживлённым берегам"c.truncateAtN(3, buffer), "По…"c);
    nt.test!("==")("По оживлённым берегам"c.truncateAtN(4, buffer), "По…"c);
    nt.test!("==")("По оживлённым берегам"c.truncateAtN(5, buffer), "По о…"c);
    nt.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(2, buffer),
        "Ἰ…"c);
    nt.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(3, buffer),
        "Ἰο…"c);
    nt.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(4, buffer),
        "Ἰοὺ…"c);
    nt.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(5, buffer),
        "Ἰοὺ…"c);
    nt.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(6, buffer),
        "Ἰοὺ ἰ…"c);
    nt.test!("==")("Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία"c.truncateAtN(256, buffer),
        "Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία"c);
    nt.test!("==")("पशुपतिरपि तान्यहानि कृच्छ्राद्"c.truncateAtN(6,buffer), "पशुपत…"c); // NB शु is 2 chars
    nt.test!("==")("पशुपतिरपि तान्यहानि कृच्छ्राद्"c.truncateAtN(8, buffer), "पशुपतिर…"c);
    nt.test!("==")("子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？"c.truncateAtN(5, buffer), "子曰：「…"c);

    // we don't yet support R-To-L languages so don't test Arabic
    //test(truncate_at_n("بِسْمِ ٱللّٰهِ ٱلرَّحْمـَبنِ ٱلرَّحِيمِ", 5c, buffer) = "…رَّحِيمِ"c);

    // Endings consisting of more than one character

    nt.test!("==")("a| d".truncateAtN(4, buffer, "..."), "a| d");
    nt.test!("==")("a| d1".truncateAtN(4, buffer, "..."), "a...");
    nt.test!("==")("1234567890".truncateAtN(7, buffer, "..."), "1234...");
    nt.test!("==")("1234567890".truncateAtN(70, buffer, "..."), "1234567890");
    nt.test!("==")("1234 6789 1234 6789 1234 6789".truncateAtN(25, buffer, "..."),
        "1234 6789 1234 6789...");

    // The buffer must still be at its initial location, i.e. none of the
    // calls after the longest one has reallocated it
    nt.test!("==")(initial_ptr, buffer.ptr);
}