1 /*******************************************************************************
2 
3     Struct template to iterate over strings in variable encoding format (utf8,
4     utf16, utf32), extracting one unicode character at a time. Each unicode
    character may be represented by one or more characters in the input string,
6     depending on the encoding format.
7 
8     The struct takes a template parameter (pull_dchars) which determines
9     whether its methods return unicode characters (utf32 - dchars) or characters
10     in the same format as the source string.
11 
12     The template also has an index operator, to extract the nth unicode
13     character in the string, and methods and static methods for extracting
14     single characters from a string of variable encoding.
15 
16     Example usage:
17 
18     ---
19 
20         import ocean.text.utf.UtfString;
21 
22         char[] test = "test string";
23         UtfString!(char) utfstr = { test };
24 
25         foreach ( width, i, c; utfstr )
26         {
27             Stdout.formatln("Character {} is {} and it's {} wide", i, c, width);
28         }
29 
30     ---
31 
32     There is also a utf_match function in the module, which compares two strings
33     for equivalence, irrespective of whether they're in the same encoding or
34     not.
35 
36     Example:
37 
38     ---
39 
40         import ocean.text.utf.UtfString;
41 
        const(char)[] str1 = "hello world ®"; // utf8 encoding
        const(dchar)[] str2 = "hello world ®"; // utf32 encoding
44 
45         assert(utf_match(str1, str2));
46 
47     ---
48 
49     Copyright:
50         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
51         All rights reserved.
52 
53     License:
54         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
55         Alternatively, this file may be distributed under the terms of the Tango
56         3-Clause BSD License (see LICENSE_BSD.txt for details).
57 
58 *******************************************************************************/
59 
60 module ocean.text.utf.UtfString;
61 
62 
63 
64 
65 import Utf = ocean.text.convert.Utf;
66 
67 import ocean.meta.types.Qualifiers;
68 
69 import ocean.core.Verify;
70 
71 version (unittest) import ocean.core.Test;
72 
/*******************************************************************************

    Sentinel value standing for "no valid unicode character". It is what the
    dchar-returning extract methods of UtfString hand back when asked to
    extract from an empty string.

*******************************************************************************/

public static immutable dchar InvalidUnicode = cast(dchar) 0xFFFF_FFFF;
80 
81 
82 
/*******************************************************************************

    Encoding agnostic string compare function.

    If both strings have the same character type they are compared directly.
    Otherwise both are decoded one unicode character at a time and compared
    character by character. The strings only match if they contain exactly the
    same sequence of unicode characters, i.e. neither string may be a mere
    prefix of the other.

    Params:
        Char1 = character type of first string to compare
        Char2 = character type of second string to compare
        str1 = first string to compare
        str2 = second string to compare

    Returns:
        true if the strings contain the same unicode characters

*******************************************************************************/

bool utf_match ( Char1, Char2 ) ( Char1[] str1, Char2[] str2 )
{
    static if ( is(Char1 == Char2) )
    {
        return str1 == str2;
    }
    else
    {
        // If either string is empty, they match only if both are empty.
        if ( str1.length == 0 || str2.length == 0 )
        {
            return str1.length == str2.length;
        }

        UtfString!(Char1, true) utf_str1 = { str1 };
        UtfString!(Char2, true) utf_str2 = { str2 };

        foreach ( c1; utf_str1 )
        {
            // str2 ran out before str1 did. This is checked explicitly (rather
            // than relying on extract() returning InvalidUnicode) so that a
            // str1 which happens to contain the InvalidUnicode value cannot
            // produce a false match.
            if ( utf_str2.string.length == 0 )
            {
                return false;
            }

            auto c2 = utf_str2.extract(true);

            if ( c1 != c2 )
            {
                return false;
            }
        }

        // str1 is exhausted: the strings only match if str2 has been fully
        // consumed as well (otherwise str1 is just a prefix of str2).
        return utf_str2.string.length == 0;
    }
}
126 
127 
128 
/*******************************************************************************

    UtfString template struct

    Wraps a string of char, wchar or dchar and allows it to be processed one
    unicode character at a time (via foreach, indexing, slicing and the extract
    methods).

    Params:
        Char = type of strings to process
        pull_dchars = determines the output type of the struct's methods. If
            true they will all output dchars (ie unicode / utf32 characters),
            otherwise they output slices of the input string, containing the
            characters representing a single unicode character.

*******************************************************************************/

public struct UtfString ( Char = char, bool pull_dchars = false )
{
    /***************************************************************************

        Check the parameter type of this struct.

    ***************************************************************************/

    static assert(
        is(Unqual!(Char) == char)
            || is(Unqual!(Char) == wchar)
            || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    /***************************************************************************

        This alias. Refers to the struct type itself (not a pointer to it), so
        that static members can be called as This.member and This.stringof
        yields the struct's name.

    ***************************************************************************/

    public alias typeof(this) This;


    /***************************************************************************

        String to iterate over.

    ***************************************************************************/

    public Char[] string;


    /***************************************************************************

        Output type alias.

        OutType is the type of a single extracted unicode character,
        ArrayOutType the type of a sequence of extracted characters.

    ***************************************************************************/

    static if ( pull_dchars )
    {
        public alias dchar OutType;
        public alias dchar[] ArrayOutType;
    }
    else
    {
        public alias Char[] OutType;
        public alias Char[] ArrayOutType;
    }


    /***************************************************************************

        Internal buffer, used by the slice operator (only written to when
        pull_dchars is true).

    ***************************************************************************/

    private ArrayOutType slice_string;


    /***************************************************************************

        foreach iterator.

        Exposes the following foreach parameters:
            size_t width = number of input characters for this unicode character
            size_t i = current index into the input string
            OutType c = the next unicode character in the string

        Params:
            dg = foreach body delegate

        Returns:
            the value returned by the last call of dg (non-zero if the
            iteration was aborted), or 0 if the string is empty

    ***************************************************************************/

    public int opApply ( scope int delegate ( ref size_t, ref size_t, ref OutType ) dg )
    {
        int res;
        size_t i;

        while ( i < this.string.length )
        {
            Char[] process = this.string[i..$];

            size_t width;
            auto c = This.extract(process, width);

            res = dg(width, i, c);
            if ( res )
            {
                break;
            }

            // advance by the number of input characters just consumed
            i += width;
        }

        return res;
    }


    /***************************************************************************

        foreach iterator.

        Exposes the following foreach parameters:
            size_t i = current index into the input string
            OutType c = the next unicode character in the string

        Params:
            dg = foreach body delegate

        Returns:
            the value returned by the last call of dg (non-zero if the
            iteration was aborted), or 0 if the string is empty

    ***************************************************************************/

    public int opApply ( scope int delegate ( ref size_t, ref OutType ) dg )
    {
        int res;
        size_t i;

        while ( i < this.string.length )
        {
            Char[] process = this.string[i..$];

            size_t width;
            auto c = This.extract(process, width);

            res = dg(i, c);
            if ( res )
            {
                break;
            }

            // advance by the number of input characters just consumed
            i += width;
        }

        return res;
    }


    /***************************************************************************

        foreach iterator.

        Exposes the following foreach parameters:
            OutType c = the next unicode character in the string

        Params:
            dg = foreach body delegate

        Returns:
            the value returned by the last call of dg (non-zero if the
            iteration was aborted), or 0 if the string is empty

    ***************************************************************************/

    public int opApply ( scope int delegate ( ref OutType ) dg )
    {
        int res;
        size_t i;

        while ( i < this.string.length )
        {
            Char[] process = this.string[i..$];

            size_t width;
            auto c = This.extract(process, width);

            res = dg(c);
            if ( res )
            {
                break;
            }

            // advance by the number of input characters just consumed
            i += width;
        }

        return res;
    }


    /***************************************************************************

        opIndex. Extracts the nth unicode character from the referenced string.

        The string is decoded from its start up to the requested character. If
        index lies beyond the last character of the string, the result of
        extracting from an empty string is returned (InvalidUnicode or an empty
        slice, depending on the pull_dchars template parameter).

        Params:
            index = index of character to extract

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

        Throws:
            via verify(), if the referenced string is empty

    ***************************************************************************/

    public OutType opIndex ( size_t index )
    {
        verify(this.string.length > 0,
            This.stringof ~ ".opIndex - attempted to index into an empty string");

        size_t i;
        size_t count;
        OutType c;
        do
        {
            size_t width;
            c = This.extract(this.string[i..$], width);

            // A zero width means no progress was made (e.g. the string is
            // exhausted): every further iteration would extract exactly the
            // same value, so stop instead of spinning until count == index.
            if ( width == 0 )
            {
                break;
            }

            i += width;
        } while ( count++ < index );

        return c;
    }


    /***************************************************************************

        opSlice. Extracts an indexed sequence of unicode characters from the
        referenced string.

        For dchar output, the returned slice is built up in the internal
        slice_string member. Otherwise a slice into the referenced string is
        returned.

        Params:
            start = index of first character to extract
            end = index one past the last character to extract (exclusive);
                must be greater than start

        Returns:
            the sliced characters (either as dchars or as the same type as the
            referenced string).

        Throws:
            via verify(), if end <= start

    ***************************************************************************/

    public ArrayOutType opSlice ( size_t start, size_t end )
    {
        verify(end > start, typeof(this).stringof ~ ".opSlice - end <= start!");

        static if ( pull_dchars )
        {
            return this.sliceCopy(start, end, this.slice_string);
        }
        else
        {
            size_t start_i;    // input index of the first sliced character
            size_t char_count; // number of unicode characters passed so far
            size_t src_i;      // current index into the input string

            while ( src_i < this.string.length )
            {
                if ( char_count == start )
                {
                    start_i = src_i;
                }
                if ( char_count >= end )
                {
                    return this.string[start_i .. src_i];
                }

                Char[] process = this.string[src_i..$];

                size_t width;
                This.extract(process, width);

                src_i += width;
                char_count++;
            }

            // The whole string has been consumed. The slice is still valid if
            // it ends exactly at the last character (end == character count);
            // the loop above cannot detect this, as its end check only runs
            // while there is input left.
            if ( char_count == end )
            {
                return this.string[start_i .. src_i];
            }

            assert(false, typeof(this).stringof ~ ".opSlice - end > array length");
        }
    }


    /***************************************************************************

        Slice / copy. Extracts an indexed sequence of unicode characters from
        the referenced string and copies them into the provided buffer.

        The returned slice is built up in the passed string, whose previous
        content is discarded. If end lies beyond the last character of the
        referenced string, all characters from start onwards are copied. If
        end <= start, the result is empty.

        Params:
            start = index of first character to extract
            end = index one past the last character to extract (exclusive)
            output = string into which the sliced characters are placed

        Returns:
            the sliced characters (either as dchars or as the same type as the
            referenced string).

    ***************************************************************************/

    public ArrayOutType sliceCopy ( size_t start, size_t end, ref ArrayOutType output )
    {
        output.length = 0;

        // Empty range: nothing to copy. Without this check the loop below
        // would copy one character for start == end == 0, as it tests for the
        // end of the range only after appending.
        if ( end <= start )
        {
            return output;
        }

        size_t i;
        foreach ( c; this )
        {
            if ( i >= start )
            {
                output ~= c;
            }

            if ( ++i >= end )
            {
                break;
            }
        }

        return output;
    }


    /***************************************************************************

        Calculates the number of unicode characters in the referenced string.
        The calculation requires that the whole string is iterated over.

        Returns:
            number of unicode characters in the string

    ***************************************************************************/

    public size_t length ( )
    {
        size_t len;

        foreach ( c; this )
        {
            len++;
        }

        return len;
    }


    /***************************************************************************

        Extract the next character from the referenced string.

        Params:
            consume = if true, the extracted characters are removed from the
                string (the start of the slice is advanced)

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public OutType extract ( bool consume = false )
    {
        size_t width;
        return this.extract(width, consume);
    }


    /***************************************************************************

        Extract the next character from the referenced string.

        Params:
            width = outputs the width (in terms of the number of characters in
                the input string) of the extracted character
            consume = if true, the extracted characters are removed from the
                string (the start of the slice is advanced)

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public OutType extract ( out size_t width, bool consume = false )
    {
        auto extracted = This.extract(this.string, width);
        if ( consume )
        {
            this.string = this.string[width..$];
        }

        return extracted;
    }


    /***************************************************************************

        Static method to extract the next character from the passed string.

        Params:
            text = string to extract from

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public static OutType extract ( Char[] text )
    {
        size_t width;
        return This.extract(text, width);
    }


    /***************************************************************************

        Static method to extract the next character from the passed string.

        If the passed string is empty, width is 0 and the return value is
        InvalidUnicode (if pull_dchars is true) or an empty slice (otherwise).

        Params:
            text = string to extract from
            width = outputs the width (in terms of the number of characters in
                the input string) of the extracted character

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    static if ( pull_dchars )
    {
        public static OutType extract ( Char[] text, out size_t width )
        {
            if ( !text.length )
            {
                return InvalidUnicode;
            }

            static if ( is(Unqual!(Char) == dchar) )
            {
                // utf32: every input character is one unicode character
                width = 1;
                return text[0];
            }
            else
            {
                return Utf.decode(text, width);
            }
        }
    }
    else
    {
        public static OutType extract ( Char[] text, out size_t width )
        {
            if ( !text.length )
            {
                // Empty slice of the input. Unlike a "" literal, this has the
                // right type for mutable as well as const / immutable Char.
                return text[0 .. 0];
            }

            static if ( is(Unqual!(Char) == dchar) )
            {
                // utf32: every input character is one unicode character
                width = 1;
            }
            else
            {
                // Only the width is of interest here; the decoded character
                // itself is not needed.
                Utf.decode(text, width);
            }

            return text[0..width];
        }
    }
}
587 
588 
unittest
{
    // The same text in two different encodings must compare as equal.
    istring utf8_text = "hello world ®";
    const(dchar)[] utf32_text = "hello world ®";

    test(utf_match(utf8_text, utf32_text));
}