ocean.text.utf.UtfUtil source code

1 /*******************************************************************************
2 
3     Contains utility functions for working with unicode strings. Contains a
4     function to return the length of a UTF-8 string, a method to truncate a
5     UTF-8 string to the nearest whitespace character that is less than a maximum
6     length parameter, and a method to truncate a UTF-8 string and append a set
7     ending to it.
8 
9     Example usage:
10 
11     ---
12 
13         char[] utf = ...; // some UTF-8 character sequence
14 
15         // using the default unicode error handler
16         size_t len1 = utf8Length(utf);
17 
18         // using a custom error handler
19         // which takes the index of the string as a parameter
        size_t len2 = utf8Length(utf, (size_t i){ /* error handling code... */ });
21 
22     ---
23 
24     Copyright:
25         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
26         All rights reserved.
27 
28     License:
29         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
30         Alternatively, this file may be distributed under the terms of the Tango
31         3-Clause BSD License (see LICENSE_BSD.txt for details).
32 
33 *******************************************************************************/
34 
35 module ocean.text.utf.UtfUtil;
36 
import ocean.core.Array: append, copy;
import ocean.core.Test;
import ocean.core.Verify;
import ocean.math.IEEE: isNaN;
import ocean.meta.types.Qualifiers;
import ocean.stdc.gnu.string: memrchr;
import ocean.text.Unicode : isSpace;
import ocean.text.utf.c.glib_unicode;

import core.exception: onUnicodeError;
47 
/*******************************************************************************

    UTF-8 representation of "…" (U+2026, horizontal ellipsis), spelled out as
    its three bytes 0xE2 0x80 0xA6 so that the value does not depend on the
    encoding of this source file.

    This is the default value of the `ending` parameter of truncateAtN().

    NOTE(review): this is declared as a mutable module-level variable, not as
    a constant, so code elsewhere could rebind it — confirm that is intended.

*******************************************************************************/

public istring ellipsis = "\xE2\x80\xA6";  // The char '…'
55 
56 
/*******************************************************************************

    Lookup table: total length in bytes of a UTF-8 sequence, indexed by the
    value of the sequence's leading byte.

    An entry of 0xFF (ubyte.max) marks a byte which cannot start a sequence:
    the continuation bytes 0x80 .. 0xBF, and 0xFE / 0xFF. 0xFF is used as the
    marker instead of 0 so that a loop which blindly advances by the looked-up
    stride can never hang on such a byte.

    Note that the table is lenient: lead bytes 0xF8 .. 0xFD are given lengths
    of 5 and 6, and 0xC0 / 0xC1 a length of 2, although RFC 3629 does not
    permit any of them in well-formed UTF-8.

*******************************************************************************/

private static immutable ubyte[char.max + 1] utf8_stride =
[
    // 0x00 .. 0x7F: single-byte (ASCII) characters
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 0x80 .. 0xBF: continuation bytes, never a valid leading byte
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,

    // 0xC0 .. 0xDF: leading byte of a 2-byte sequence
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // 0xE0 .. 0xEF: leading byte of a 3-byte sequence
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // 0xF0 .. 0xF7: 4 bytes; 0xF8 .. 0xFB: 5 bytes; 0xFC .. 0xFD: 6 bytes;
    // 0xFE .. 0xFF: invalid
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0xFF, 0xFF,
];
85 
86 
/*******************************************************************************

    Counts the code points of a UTF-8 encoded string, reporting any problem
    through the standard unicode error handler (onUnicodeError), which throws
    a new UnicodeException.

    Params:
        str = The string whose code points are to be counted.

    Returns:
        The number of code points in str.

    Throws:
        UnicodeException if an invalid UTF-8 code unit is detected.

*******************************************************************************/

public size_t utf8Length ( cstring str )
{
    // Forward to the delegate-taking overload, turning every reported index
    // into a call to the runtime's unicode error handler.
    return utf8Length(str,
        ( size_t index )
        {
            onUnicodeError("invalid UTF-8 sequence", index);
        }
    );
}
113 
114 
/*******************************************************************************

    Calculates the number of UTF8 code points in a UTF8-encoded string.
    Calls error_dg if an invalid UTF8 code unit is detected,
    which may throw an exception to abort processing.

    Only the leading byte of each sequence is inspected (via utf8_stride); the
    continuation bytes which follow it are skipped without being checked.

    If error_dg returns instead of throwing, processing carries on:

    * a byte which cannot start a sequence is reported once, counted as a
      single code point, and scanning resumes at the very next byte;
    * a final sequence which is cut short by the end of the string is counted
      as a single code point and reported once, with the index of its leading
      byte.

    Params:
        str = The string to calculate the length of.
        error_dg = The error delegate to call upon finding an invalid code unit.
            Takes a size_t parameter representing the index (in bytes) of the
            offending leading byte in the string.

    Returns:
        The length of the given string, in code points.

*******************************************************************************/

public size_t utf8Length ( cstring str, scope void delegate ( size_t ) error_dg )
{
    size_t length;
    size_t i;
    size_t stride;

    for ( i = 0; i < str.length; i += stride )
    {
        // check how much we should increment the index
        // based on the size of the current UTF8 code point
        stride = utf8_stride[str[i]];

        if ( stride == ubyte.max )
        {
            error_dg(i);

            // The delegate chose not to abort. ubyte.max is only a marker,
            // not a real sequence length: advancing by it would skip up to
            // 255 bytes of input (and report this index a second time below),
            // so resynchronise on the next byte instead.
            stride = 1;
        }

        length++;
    }

    // Overshooting the end means that the last leading byte announced more
    // bytes than the string actually contains.
    if ( i > str.length )
    {
        verify(i >= stride, "i should be stride or greater");
        i -= stride;
        verify(i < str.length, "i - stride should be less than str.length");
        error_dg(i);
    }

    return length;
}

unittest
{
    test(utf8Length(null) == 0,
        "the length of a null string should be 0");

    test(utf8Length("") == 0,
        "the length of an empty string should be 0");

    test(utf8Length("foo bar baz xyzzy") == 17,
        "the length of \"foo bar baz xyzzy\" should be 17");

    test(utf8Length("ðäß ßøø+ ì$ æ ¢ööđ µøvi€ →→→") == 28,
        "the length of \"ðäß ßøø+ ì$ æ ¢ööđ µøvi€ →→→\" should be 28");

    // test if error delegate is called for an invalid string
    bool error_caught = false;
    static immutable istring error_str = "error in " ~ char.init ~ " the middle";
    utf8Length(error_str, ( size_t i ) { error_caught = true; });
    test(error_caught,
        "the call to utf8Length should have caught an error");

    // test that the error delegate is not called for a valid string
    error_caught = false;
    static immutable istring valid_str = "There are no errors in this string!";
    utf8Length(valid_str, ( size_t i ) { error_caught = true; });
    test(!error_caught,
        "the call to utf8Length should not have caught an error");

    // a non-throwing delegate is notified exactly once about an invalid byte,
    // and counting then resumes with the byte which follows it
    size_t error_count = 0;
    size_t error_index = size_t.max;
    auto counted = utf8Length(error_str,
        ( size_t i ) { error_count++; error_index = i; });
    test(error_count == 1,
        "the invalid byte should have been reported exactly once");
    test(error_index == "error in ".length,
        "the reported index should be that of the invalid byte");
    test(counted == error_str.length,
        "every byte of error_str should have been counted as one code point");

    // a sequence cut short by the end of the string is reported once, with
    // the index of its leading byte
    error_count = 0;
    error_index = size_t.max;
    static immutable istring cut_str = "ab" ~ cast (char) 0xC3;
    counted = utf8Length(cut_str,
        ( size_t i ) { error_count++; error_index = i; });
    test(error_count == 1,
        "the truncated sequence should have been reported exactly once");
    test(error_index == 2,
        "the reported index should be that of the truncated sequence");
    test(counted == 3,
        "the truncated sequence should have been counted as one code point");
}
191 
192 
/*******************************************************************************

    Limits str to a length of n UTF-8 code points, cutting off on the last
    space, if found. If str is not valid UTF-8, str.length is assumed to be the
    number of code points.

    When a space is found within the first n code points, the string is cut
    after the last non-space character which precedes that space (so a run of
    consecutive spaces is removed as a whole). If only spaces precede the
    break, the result is empty. When no space is found, the string is cut
    after exactly n code points.

    str is shortened in place (it is passed by reference); a string of n or
    fewer code points is left untouched.

    Params:
        str = string to limit the length
        n = maximum number of code points in the resulting string

    Out:
        The maximum number of code points in str is n.

    Returns:
        The truncated string for method chaining

*******************************************************************************/

public mstring truncateAtWordBreak ( ref mstring str, size_t n )
out (result)
{
    // A result of at most n bytes trivially has at most n code points, so the
    // (more expensive) glib checks are only needed for longer results.
    if (result.length > n)
    {
        assert(g_utf8_validate(result.ptr, result.length, null));
        assert(g_utf8_strlen(result.ptr, result.length) <= n);
    }
}
do
{
    // A string of at most n bytes cannot hold more than n code points.
    if (n < str.length)
    {
        bool valid_utf8 = g_utf8_validate(str.ptr, str.length, null);

        auto utf8_len = valid_utf8 ? utf8Length(str) : str.length;

        if (n < utf8_len)
        {
            // Number of bytes occupied by the first n code points (or simply
            // n, if the string cannot be interpreted as UTF-8).
            size_t last = n;

            if (valid_utf8)
            {
                last = g_utf8_offset_to_pointer(str.ptr, last) - str.ptr;
            }

            // Search backwards for a ' ' within those bytes.
            char* c = cast(char*) memrchr(str.ptr, ' ', last);

            if (c)
            {
                // Skip consecutive ' ' characters.
                while (*c == ' ' && c > str.ptr)
                {
                    c--;
                }

                // The loop stops either on the last non-space character in
                // front of the break, which must be kept (even if it is the
                // very first character of the string), or on a ' ' at the
                // start of the string, in which case nothing precedes the
                // break. The two cases must be told apart by looking at *c:
                // comparing c with str.ptr would wrongly drop a first word
                // consisting of a single character.
                size_t keep = c - str.ptr;

                if (*c != ' ')
                {
                    keep++;
                }

                str.length = keep;
            }
            else
            {
                // If no ' ' is found to break on, set the break to the maximum
                // number of code points
                str.length = last;
            }
        }
    }

    return str;
}

unittest
{
    void doTest ( cstring input, cstring expected_output, int length, int line = __LINE__ )
    {
        mstring buffer;
        buffer.copy(input);
        test!("==")(truncateAtWordBreak(buffer, length), expected_output, __FILE__, line);
    }

    doTest("Hello World!", "Hello World!", "Hello World!".length);

    doTest("Hello World!", "Hello World!", "Hello World!".length + 5);

    doTest("Hello World!", "Hello", 9);

    doTest("Hällö World!", "Hällö", 9);

    doTest("äöü", "äöü", 3);

    doTest("Hello  World!", "Hello", 9);

    // A first word consisting of a single character must be kept
    doTest("a bcdef", "a", 3);

    doTest("a   bcdef", "a", 5);

    // Nothing but spaces in front of the break: nothing is kept
    doTest("  abcdef", "", 2);
}
282 
283 
/*******************************************************************************

    Truncate the length of a UTF-8 string and append a set ending. The string
    is initially truncated (at a word break, see truncateAtWordBreak()) so that
    it is of maximum length n including the extra ending parameter, i.e. the
    string is truncated to n minus the length of the ending.

    n counts code points, so the ending is measured in code points as well,
    provided that both str and ending are valid UTF-8. If either of them is
    not, lengths cannot be measured in code points and the ending is accounted
    for with its length in bytes.

    A string of n or fewer code points is returned unchanged, without the
    ending.

    Params:
        str = string to truncate and append the ending to
        n = maximum number of code points in the resulting string
        ending = the ending to append to the string, defaults to "..."

    In:
        n must be at least the length of ending (measured as described above)

    Returns:
        The truncated and appended string for method chaining

*******************************************************************************/

public mstring truncateAppendEnding ( ref mstring str, size_t n, cstring ending = "...")
{
    bool valid_utf8 = g_utf8_validate(str.ptr, str.length, null);

    // Measure the ending in the same unit as n. Using its byte length
    // unconditionally would make a multi-byte ending (such as "…") eat up
    // more of the allowance than the code points it actually contributes.
    size_t ending_len = ending.length;

    if (valid_utf8 && g_utf8_validate(ending.ptr, ending.length, null))
    {
        ending_len = utf8Length(ending);
    }

    verify (n >= ending_len);

    auto utf8_len = valid_utf8 ? utf8Length(str) : str.length;

    if (n < utf8_len)
    {
        truncateAtWordBreak(str, (n - ending_len));
        str.append(ending);
    }

    return str;
}

unittest
{
    mstring buffer;

    void doTest ( cstring input, cstring expected_output, int length,
        cstring ending = "..." , int line = __LINE__ )
    {
        buffer.copy(input);
        test!("==")(truncateAppendEnding(buffer, length, ending),
            expected_output, __FILE__, line);
    }

    doTest("Hello World!", "Hello World!", "Hello World!".length);

    doTest("Hello World!", "Hello World!", "Hello World!".length + 5);

    doTest("Hello World!", "Hello...", 9);

    doTest("Hällö World!", "Hällö...", 9);

    doTest("äöü äöü", "ä...", 4);

    doTest("Hello  World!", "Hello...", 9);

    doTest("HelloW"  ~ cast (char) 0x81 ~ "rld!",
        "HelloW"  ~ cast (char) 0x81 ~ "...", 10);

    doTest("HelloWörld!", "HelloWörl+", 10, "+");

    // A multi-byte ending only uses up as many code points as it contains
    doTest("HelloWorld!", "HelloWor…", 9, "…");

    doTest("Designstarker Couchtisch in hochwertiger Holznachbildung. Mit "
      ~ "praktischem Ablagebogen in Kernnussbaumfarben oder Schwarz. "
      ~ "Winkelfüße mit Alukante. B", "Designstarker Couchtisch in hochwertiger"
      ~ " Holznachbildung. Mit praktischem Ablagebogen...", 90);
}
355 
356 
/*******************************************************************************

    Slices a UTF-8 string so that it occupies no more than the given number
    of bytes.

    The result is what str[0 .. max_len] would give, except that the cut is
    moved backwards where necessary so that it never falls in the middle of a
    multi-byte UTF-8 character.

    Params:
        str     = the string to be sliced
        max_len = the maximum allowable length (in bytes) of the string

    Returns:
        a slice of the original string, of length at most max_len.

*******************************************************************************/

public inout(mstring) limitStringLength ( inout(mstring) str, size_t max_len )
{
    // Short enough already: hand the string back untouched.
    if ( max_len >= str.length )
    {
        return str;
    }

    // str[cut] is the first byte which would be dropped. Every UTF-8
    // continuation byte has the form 0b10xxxxxx; if the first dropped byte is
    // one of those, the cut would split a character, so move it backwards
    // until it sits on the start of a character (or of the string).
    size_t cut = max_len;

    for ( ; cut > 0; --cut )
    {
        if ( (str[cut] & 0xC0) != 0x80 )
        {
            break;
        }
    }

    return str[0 .. cut];
}
395 
396 
unittest
{
    // Checks a single input / limit / expected slice combination, reporting
    // a failure at the line of the caller.
    void check ( istring input, size_t max_len, istring expected,
        int line = __LINE__ )
    {
        test!("==")(limitStringLength(input, max_len), expected,
            __FILE__, line);
    }

    // The last character takes 1 byte
    check("abc", 5, "abc");
    check("abc", 2, "ab");

    // The last character takes 2 bytes
    check("ÜÄ", 5, "ÜÄ");
    check("ÜÄ", 4, "ÜÄ");
    check("ÜÄ", 3, "Ü");
    check("ÜÄ", 2, "Ü");
    check("ÜÄ", 1, "");

    // The last character takes 3 bytes
    check("Ü眼", 6, "Ü眼");
    check("Ü眼", 5, "Ü眼");
    check("Ü眼", 4, "Ü");

    // The function must also be callable with (and return) an mstring
    mstring mutable_input = "abcd".dup;
    mstring mutable_result = limitStringLength(mutable_input, 2);
}
423 
424 
/*******************************************************************************

    Truncates a string at the last space before the n-th Unicode character or,
    if the resulting string is too short, at the n-th Unicode character.
    The string should be valid UTF-8 (the caller should have validated it
    before calling this function).

    If the string is truncated before its end, then the final Unicode
    characters of the result are made up by the ending. Trailing space is
    removed before the ending is added. The returned string will always be no
    more than n Unicode characters long (including the ending).

    If the only characters which do not fit are spaces, they are simply
    removed and no ending is appended.

    The basic algorithm is to walk through src, keeping track of how many
    bytes would need to be sliced at any particular time, until we know where
    we need to end. Because we don't know until the end whether an ending is
    needed, we need to keep track of one Unicode character behind, as well as
    of the position of the Unicode character before the last space. We have to
    be careful never to point at spaces.

    Important points when reading the algorithm:

    1) Unicode character != byte
    2) the byte offset of the current Unicode character == the number of bytes
        required to include the _previous_ Unicode character

    Params:
        src        = the string to truncate (must be UTF-8 encoded)
        n          = the maximum number of Unicode characters allowed in the
                     returned string (must be greater than the number of
                     Unicode characters in "ending")
        buffer     = a buffer to be used to store the result in (may be
                     resized). The buffer is required because "ending" may
                     contain Unicode characters taking more bytes than the
                     Unicode characters in src they replace, thus leading to a
                     string with fewer Unicode characters but more bytes.
        ending     = These Unicode characters will be appended when "src" needs
                     to be truncated.
        fill_ratio = if cutting the string at the last space would make its
                     Unicode character length smaller than "n*fill_ratio",
                     then we cut it on the n-th Unicode character. Must be in
                     the range [0, 1].

    Returns:
        buffer

*******************************************************************************/

public mstring truncateAtN(cstring src, size_t n, ref mstring buffer,
    cstring ending = ellipsis, float fill_ratio = 0.75)
out (result)
{
    // The result must never exceed n Unicode characters.
    size_t result_code_points = 0;
    foreach ( dchar c; result )
    {
        result_code_points++;
    }

    assert(result_code_points <= n);
}
do
{
    // Number of Unicode characters (not bytes) in the ending.
    size_t ending_code_points = 0;
    foreach ( dchar c; ending )
    {
        ending_code_points++;
    }

    verify(n > ending_code_points);

    verify(!isNaN(fill_ratio));
    verify(fill_ratio>=0 && fill_ratio<=1);

    // The maximum number of Unicode characters of src that can be kept, if
    // the ending is used.
    size_t keep_limit = n - ending_code_points;

    // Index (in Unicode characters, as opposed to bytes) of the character
    // currently being looked at.
    size_t seen = 0;

    // Number of bytes of src to copy if the ending is used and we cut on a
    // character boundary. Never covers trailing space.
    size_t cut_bytes = 0;

    // Byte offset / Unicode character count of the start of the last run of
    // spaces which lies within the first keep_limit characters (relevant if
    // the ending is used).
    size_t space_bytes_with_ending = 0;
    size_t space_points_with_ending = 0;

    // The same, but within the first n characters (relevant if the ending is
    // not used). The character count is recorded but not consulted later.
    size_t space_bytes_plain = 0;
    size_t space_points_plain = 0;

    bool truncated;     // Has a non-space character been found beyond n?
    bool prev_space;    // Was the previous character a space?

    foreach ( size_t offset, dchar c; src )
    {
        bool is_space = isSpace(c);

        // True on the first space which follows a non-space character (or
        // which starts the string).
        bool run_starts = is_space && ! prev_space;

        // Bookkeeping for the case that the ending is used. In the edge case
        // (seen == keep_limit) the current character itself is not kept, but
        // its offset is exactly the number of bytes which include the previous
        // one. Nothing is updated while inside a run of spaces, so that the
        // recorded positions never include trailing space.
        if ( ! prev_space && seen <= keep_limit )
        {
            cut_bytes = offset;

            if ( is_space )
            {
                space_bytes_with_ending = offset;
                space_points_with_ending = seen;
            }
        }

        // Bookkeeping for the case that the ending is not used: as above,
        // but with "n" instead of "keep_limit".
        if ( run_starts && seen <= n )
        {
            space_bytes_plain = offset;
            space_points_plain = seen;
        }

        prev_space = is_space;

        // From the n-th character onwards everything is dropped. Dropped
        // spaces are just removed; only once a non-space character is dropped
        // we know that we are really truncating, and nothing further needs to
        // be examined.
        if ( ! is_space && seen >= n )
        {
            truncated = true;
            break;
        }

        ++seen;
    }

    if ( ! truncated )
    {
        // The whole of src was walked without having to drop a non-space
        // character. If src fitted into n characters and does not end with a
        // space, it is used as it is; otherwise the start of the trailing
        // run of spaces has already been recorded by the loop.
        if ( seen <= n && ! prev_space )
        {
            space_bytes_plain = src.length;
        }

        // No ending is needed, so copy just the part of src found above.
        assumeSafeAppend(buffer);
        buffer.length = space_bytes_plain;
        assumeSafeAppend(buffer);
        buffer[] = src[0 .. space_bytes_plain];

        return buffer;
    }

    // Prefer to cut at the last space, provided that the string in front of
    // it is long enough.
    if ( space_bytes_with_ending
        && (space_points_with_ending / (cast(float)n) > fill_ratio) )
    {
        cut_bytes = space_bytes_with_ending;
    }

    // Copy the part of src to keep, followed by the ending.
    size_t total_bytes = cut_bytes + ending.length;

    assumeSafeAppend(buffer);
    buffer.length = total_bytes;
    assumeSafeAppend(buffer);
    buffer[0 .. cut_bytes] = src[0 .. cut_bytes];
    buffer[cut_bytes .. total_bytes] = ending[];

    return buffer;
}
625 
// Tests for truncateAtN(). All calls share a single buffer: the longest input
// is processed before buffer.ptr is recorded, and the final check verifies
// that none of the later (shorter) calls made the buffer reallocate.
unittest
{
    auto t = new NamedTest(
        "truncateAtN"
    );

    mstring buffer;

    // Old test
    // (the module-level ellipsis constant must match the bytes of a literal "…")
    foreach (i, char c; "…")
    {
        t.test!("==")(ellipsis[i], c);
    }

    // Strings which fit are returned unchanged; longer ones are cut and get
    // the default ending
    istring str = "Hello World!";
    t.test!("==")(str.truncateAtN(str.length, buffer), "Hello World!");
    t.test!("==")(str.truncateAtN(str.length + 5, buffer), "Hello World!");
    t.test!("==")(str.truncateAtN(10, buffer), "Hello Wor" ~ ellipsis);

    t.test!("==")("Hällö World!"c.truncateAtN(10, buffer),
        "Hällö Wor"c ~ ellipsis);
    t.test!("==")("äöü"c.truncateAtN(3, buffer), "äöü"c);
    t.test!("==")("Hello  World!".dup.truncateAtN(10, buffer),
        "Hello  Wo" ~ ellipsis);
    t.test!("==")("HelloWörld!"c.truncateAtN(10, buffer, "+"), "HelloWörl+"c);
    t.test!("==")(
        "Designstarker Couchtisch in hochwertiger Holznachbildung. Mit praktischem Ablagebogen in Kernnussbaumfarben oder Schwarz. Winkelfüße mit Alukante. B"c.truncateAtN(100, buffer),
        "Designstarker Couchtisch in hochwertiger Holznachbildung. Mit praktischem Ablagebogen in"c ~ ellipsis
    );

    // Andrew's tests

    t.test!("==")(("This should be the longest string of all the unit tests.\n"
      ~ "We do this so that the buffer never needs expanding again.\n"
      ~ "This way we can check for unnecessary allocations.")
        .truncateAtN(160, buffer),
        "This should be the longest string of all the unit tests.\n"
      ~ "We do this so that the buffer never needs expanding again.\n"
      ~ "This way we can check for unnecessary…"
    );

    // Remember where the buffer lives now, for the allocation check at the end
    typeof(buffer.ptr) orig_ptr = buffer.ptr;

    // Handling of spaces at and around the truncation point
    t.test!("==")("     ".truncateAtN(2, buffer), "");
    t.test!("==")("12   ".truncateAtN(4, buffer), "12");
    t.test!("==")("12   ".truncateAtN(6, buffer), "12");
    t.test!("==")("hello".truncateAtN(2, buffer), "h…");
    t.test!("==")("hello".truncateAtN(4, buffer), "hel…");
    t.test!("==")("hello".truncateAtN(5, buffer), "hello");
    t.test!("==")("hello".truncateAtN(6, buffer), "hello");
    t.test!("==")("hello".truncateAtN(10, buffer), "hello");
    t.test!("==")("h l o".truncateAtN(5, buffer), "h l o");
    t.test!("==")("hello ".truncateAtN(5, buffer), "hello");
    t.test!("==")("hello ".truncateAtN(6, buffer), "hello");
    t.test!("==")("hello ".truncateAtN(7, buffer), "hello");
    t.test!("==")("hello ".truncateAtN(10, buffer), "hello");
    t.test!("==")("hello   world".truncateAtN(8, buffer), "hello…");
    t.test!("==")("hello | world".truncateAtN(7, buffer), "hello…");
    t.test!("==")("hello | world".truncateAtN(8, buffer), "hello |…");
    t.test!("==")("hello | world".truncateAtN(32, buffer), "hello | world");
    t.test!("==")("h llo world".truncateAtN(3, buffer), "h…");
    t.test!("==")("he  ll  o  world".truncateAtN(9, buffer), "he  ll…");
    t.test!("==")("he  ll  o  world".truncateAtN(10, buffer), "he  ll  o…");
    t.test!("==")("he  ll  o  world".truncateAtN(32, buffer),
        "he  ll  o  world");

    // Inputs whose length is close to n
    t.test!("==")("a".truncateAtN(4, buffer), "a");
    t.test!("==")("ab".truncateAtN(4, buffer), "ab");
    t.test!("==")("a|".truncateAtN(4, buffer), "a|");
    t.test!("==")("ab|".truncateAtN(4, buffer), "ab|");
    t.test!("==")("ab|d".truncateAtN(4, buffer), "ab|d");
    t.test!("==")("abc|".truncateAtN(4, buffer), "abc|");
    t.test!("==")("abcd| ".truncateAtN(4, buffer), "abc…");
    t.test!("==")("a| d".truncateAtN(4, buffer), "a| d");

    // Multi-byte scripts: n counts Unicode characters, not bytes
    t.test!("==")("По оживлённым берегам"c.truncateAtN(2, buffer), "П…"c);
    t.test!("==")("По оживлённым берегам"c.truncateAtN(3, buffer), "По…"c);
    t.test!("==")("По оживлённым берегам"c.truncateAtN(4, buffer), "По…"c);
    t.test!("==")("По оживлённым берегам"c.truncateAtN(5, buffer), "По о…"c);
    t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(2, buffer),
        "Ἰ…"c);
    t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(3, buffer),
        "Ἰο…"c);
    t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(4, buffer),
        "Ἰοὺ…"c);
    t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(5, buffer),
        "Ἰοὺ…"c);
    t.test!("==")("Ἰοὺ ἰού· τὰ πάντʼ ἂν ἐξήκοι σαφῆ."c.truncateAtN(6, buffer),
        "Ἰοὺ ἰ…"c);
    t.test!("==")("Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία"c.truncateAtN(256, buffer),
        "Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία"c);
    t.test!("==")("पशुपतिरपि तान्यहानि कृच्छ्राद्"c.truncateAtN(6,buffer), "पशुपत…"c); // NB शु is 2 chars
    t.test!("==")("पशुपतिरपि तान्यहानि कृच्छ्राद्"c.truncateAtN(8, buffer), "पशुपतिर…"c);
    t.test!("==")("子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？"c.truncateAtN(5, buffer), "子曰：「…"c);

    // we don't yet support R-To-L languages so don't test Arabic
    //test(truncate_at_n("بِسْمِ ٱللّٰهِ ٱلرَّحْمـَبنِ ٱلرَّحِيمِ", 5c, buffer) = "…رَّحِيمِ"c);

    // Use some other ending that is not one character.
    t.test!("==")("a| d".truncateAtN(4, buffer, "..."), "a| d");
    t.test!("==")("a| d1".truncateAtN(4, buffer, "..."), "a...");
    t.test!("==")("1234567890".truncateAtN(7, buffer, "..."), "1234...");
    t.test!("==")("1234567890".truncateAtN(70, buffer, "..."), "1234567890");
    t.test!("==")("1234 6789 1234 6789 1234 6789".truncateAtN(25, buffer, "..."),
        "1234 6789 1234 6789...");

    // check nothing has allocated
    // (the buffer must still be at the address recorded above)
    t.test!("==")(orig_ptr, buffer.ptr);
}