1 /*******************************************************************************

    Struct template to iterate over strings in variable encoding format (utf8,
    utf16, utf32), extracting one unicode character at a time. Each unicode
    character may be represented by one or more characters in the input string,
    depending on the encoding format.
7 8 The struct takes a template parameter (pull_dchars) which determines
9 whether its methods return unicode characters (utf32 - dchars) or characters
10 in the same format as the source string.
11 12 The template also has an index operator, to extract the nth unicode
13 character in the string, and methods and static methods for extracting
14 single characters from a string of variable encoding.
15 16 Example usage:
17 18 ---
19 20 import ocean.text.utf.UtfString;

        istring test = "test string";
        UtfString!(immutable(char)) utfstr = { test };
24 25 foreach ( width, i, c; utfstr )
26 {
27 Stdout.formatln("Character {} is {} and it's {} wide", i, c, width);
28 }
29 30 ---
31 32 There is also a utf_match function in the module, which compares two strings
33 for equivalence, irrespective of whether they're in the same encoding or
34 not.
35 36 Example:
37 38 ---
39 40 import ocean.text.utf.UtfString;

        istring str1 = "hello world ®"; // utf8 encoding
        const(dchar)[] str2 = "hello world ®"; // utf32 encoding
44 45 assert(utf_match(str1, str2));
46 47 ---
48 49 Copyright:
50 Copyright (c) 2009-2016 dunnhumby Germany GmbH.
51 All rights reserved.
52 53 License:
54 Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
55 Alternatively, this file may be distributed under the terms of the Tango
56 3-Clause BSD License (see LICENSE_BSD.txt for details).
57 58 *******************************************************************************/59 60 moduleocean.text.utf.UtfString;
61 62 63 64 65 importUtf = ocean.text.convert.Utf;
66 67 importocean.meta.types.Qualifiers;
68 69 importocean.core.Verify;
70 71 version (unittest) importocean.core.Test;
/*******************************************************************************

    Marker value for "no valid unicode character".

    All 32 bits are set, which lies outside the range of valid unicode code
    points (the highest valid code point is 0x10FFFF), so it cannot be confused
    with a real decoded character.

*******************************************************************************/

public static immutable dchar InvalidUnicode = cast(dchar) uint.max;
/*******************************************************************************

    Encoding agnostic string compare function.

    If both strings use the same character type (ignoring type qualifiers),
    they are compared directly. Otherwise both strings are decoded one unicode
    character at a time and compared character by character; they match only
    if every character is equal and both strings are exhausted together.

    Params:
        Char1 = character type of first string to compare
        Char2 = character type of second string to compare
        str1 = first string to compare
        str2 = second string to compare

    Returns:
        true if the strings contain the same unicode characters

*******************************************************************************/

bool utf_match ( Char1, Char2 ) ( Char1[] str1, Char2[] str2 )
{
    static if ( is(Unqual!(Char1) == Unqual!(Char2)) )
    {
        // Same encoding: a plain array comparison is sufficient.
        return str1 == str2;
    }
    else
    {
        UtfString!(Char1, true) utf_str1 = { str1 };
        UtfString!(Char2, true) utf_str2 = { str2 };

        foreach ( c1; utf_str1 )
        {
            // str2 ran out of characters before str1 did. This must be
            // checked explicitly: extracting from an empty string yields
            // InvalidUnicode, which could otherwise compare equal to a
            // (malformed) 0xffffffff code unit in str1.
            if ( utf_str2.string.length == 0 )
            {
                return false;
            }

            // Consume the next character of str2 in lock-step with str1.
            auto c2 = utf_str2.extract(true);

            if ( c1 != c2 )
            {
                return false;
            }
        }

        // str1 is exhausted; the strings only match if str2 is exhausted as
        // well (otherwise str1 is merely a prefix of str2).
        return utf_str2.string.length == 0;
    }
}
/*******************************************************************************

    UtfString template struct

    Wraps a string of char, wchar or dchar and allows it to be processed one
    unicode character at a time, via foreach, indexing, slicing and the
    extract() methods.

    Params:
        Char = type of strings to process
        pull_dchars = determines the output type of the struct's methods. If
            true they will all output dchars (ie unicode / utf32 characters),
            otherwise they output slices of the input string, containing the
            characters representing a single unicode character.

*******************************************************************************/

public struct UtfString ( Char = char, bool pull_dchars = false )
{
    /***************************************************************************

        Check the parameter type of this class.

    ***************************************************************************/

    static assert(
        is(Unqual!(Char) == char)
        || is(Unqual!(Char) == wchar)
        || is(Unqual!(Char) == dchar),
        This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof
    );

    /***************************************************************************

        This alias. Used to call the static extract() methods and to build
        diagnostic messages.

    ***************************************************************************/

    public alias typeof(&this) This;


    /***************************************************************************

        String to iterate over.

    ***************************************************************************/

    public Char[] string;


    /***************************************************************************

        Output type aliases.

        OutType is the type of a single extracted unicode character,
        ArrayOutType is the type of a sequence of extracted characters.

    ***************************************************************************/

    static if ( pull_dchars )
    {
        public alias dchar OutType;
        public alias dchar[] ArrayOutType;
    }
    else
    {
        public alias Char[] OutType;
        public alias Char[] ArrayOutType;
    }


    /***************************************************************************

        Internal buffer, used by the slice operator when pull_dchars is true
        (the sliced dchars are built up in this buffer).

    ***************************************************************************/

    private ArrayOutType slice_string;


    /***************************************************************************

        foreach iterator.

        Exposes the following foreach parameters:
            size_t width = number of input characters for this unicode character
            size_t i = current index into the input string
            OutType c = the next unicode character in the string

    ***************************************************************************/

    public int opApply ( scope int delegate ( ref size_t, ref size_t, ref OutType ) dg )
    {
        int res;
        size_t i;

        while ( i < this.string.length )
        {
            Char[] process = this.string[i..$];

            size_t width;
            auto c = This.extract(process, width);

            res = dg(width, i, c);
            if ( res )
            {
                break;
            }

            // Advance by the number of input characters just decoded.
            i += width;
        }

        return res;
    }


    /***************************************************************************

        foreach iterator.

        Exposes the following foreach parameters:
            size_t i = current index into the input string
            OutType c = the next unicode character in the string

    ***************************************************************************/

    public int opApply ( scope int delegate ( ref size_t, ref OutType ) dg )
    {
        int res;
        size_t i;

        while ( i < this.string.length )
        {
            Char[] process = this.string[i..$];

            size_t width;
            auto c = This.extract(process, width);

            res = dg(i, c);
            if ( res )
            {
                break;
            }

            // Advance by the number of input characters just decoded.
            i += width;
        }

        return res;
    }


    /***************************************************************************

        foreach iterator.

        Exposes the following foreach parameters:
            OutType c = the next unicode character in the string

    ***************************************************************************/

    public int opApply ( scope int delegate ( ref OutType ) dg )
    {
        int res;
        size_t i;

        while ( i < this.string.length )
        {
            Char[] process = this.string[i..$];

            size_t width;
            auto c = This.extract(process, width);

            res = dg(c);
            if ( res )
            {
                break;
            }

            // Advance by the number of input characters just decoded.
            i += width;
        }

        return res;
    }


    /***************************************************************************

        opIndex. Extracts the nth unicode character from the referenced string.

        The string is decoded from its beginning on every call. If index lies
        beyond the last character, the result is what extract() returns for an
        empty string (InvalidUnicode or an empty slice).

        Params:
            index = index of character to extract

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public OutType opIndex ( size_t index )
    {
        verify(this.string.length > 0,
            This.stringof ~ ".opIndex - attempted to index into an empty string");

        size_t i;
        size_t count;
        OutType c;
        do
        {
            size_t width;
            c = This.extract(this.string[i..$], width);
            i += width;
        } while ( count++ < index );

        return c;
    }


    /***************************************************************************

        opSlice. Extracts an indexed sequence of unicode characters from the
        referenced string.

        For dchar output, the returned slice is built up in the internal
        slice_string member. Otherwise a slice into the referenced string is
        returned.

        Indices count unicode characters, not elements of the input string.
        The range must not be empty (end > start), which is checked with
        verify().

        Params:
            start = index of first character to extract
            end = index one past the last character to extract (exclusive)

        Returns:
            the sliced characters (either as dchars or as the same type as the
            referenced string).

    ***************************************************************************/

    public ArrayOutType opSlice ( size_t start, size_t end )
    {
        verify(end > start, typeof(this).stringof ~ ".opSlice - end <= start!");

        static if ( pull_dchars )
        {
            return this.sliceCopy(start, end, this.slice_string);
        }
        else
        {
            size_t start_i;     // input index where character 'start' begins
            size_t char_count;  // number of unicode characters passed so far
            size_t src_i;       // current index into the input string

            while ( src_i < this.string.length )
            {
                if ( char_count == start )
                {
                    start_i = src_i;
                }
                if ( char_count >= end )
                {
                    return this.string[start_i .. src_i];
                }

                Char[] process = this.string[src_i..$];

                size_t width;
                This.extract(process, width);

                src_i += width;
                char_count++;
            }

            // The whole string has been consumed. If exactly 'end' characters
            // were counted, the requested slice extends to the end of the
            // string. (As end > start, start_i was set inside the loop.)
            if ( char_count == end )
            {
                return this.string[start_i .. $];
            }

            assert(false, typeof(this).stringof ~ ".opSlice - end > array length");
        }
    }


    /***************************************************************************

        Slice / copy. Extracts an indexed sequence of unicode characters from
        the referenced string and copies them into the provided buffer.

        The returned slice is built up in the passed string, whose previous
        content is discarded. If the referenced string contains fewer than end
        characters, all characters from start onwards are copied.

        Params:
            start = index of first character to extract
            end = index one past the last character to extract (exclusive)
            output = string into which the sliced characters are placed

        Returns:
            the sliced characters (either as dchars or as the same type as the
            referenced string).

    ***************************************************************************/

    public ArrayOutType sliceCopy ( size_t start, size_t end, ref ArrayOutType output )
    {
        output.length = 0;

        size_t i;
        foreach ( c; this )
        {
            if ( i >= start )
            {
                output ~= c;
            }

            if ( ++i >= end )
            {
                break;
            }
        }

        return output;
    }


    /***************************************************************************

        Calculates the number of unicode characters in the referenced string.
        The calculation requires that the whole string is iterated over.

        Returns:
            number of unicode characters in the string

    ***************************************************************************/

    public size_t length ( )
    {
        size_t len;

        foreach ( c; this )
        {
            len++;
        }

        return len;
    }


    /***************************************************************************

        Extract the next character from the referenced string.

        Params:
            consume = if true, the extracted characters are removed from the
                string (the start of the slice is advanced)

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public OutType extract ( bool consume = false )
    {
        size_t width;
        return this.extract(width, consume);
    }


    /***************************************************************************

        Extract the next character from the referenced string.

        Params:
            width = outputs the width (in terms of the number of characters in
                the input string) of the extracted character
            consume = if true, the extracted characters are removed from the
                string (the start of the slice is advanced)

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public OutType extract ( out size_t width, bool consume = false )
    {
        auto extracted = This.extract(this.string, width);
        if ( consume )
        {
            this.string = this.string[width..$];
        }

        return extracted;
    }


    /***************************************************************************

        Static method to extract the next character from the passed string.

        Params:
            text = string to extract from

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    public static OutType extract ( Char[] text )
    {
        size_t width;
        return This.extract(text, width);
    }


    /***************************************************************************

        Static method to extract the next character from the passed string.

        If the passed string is empty, width is set to 0 and InvalidUnicode
        (pull_dchars == true) or an empty slice (pull_dchars == false) is
        returned.

        Params:
            text = string to extract from
            width = outputs the width (in terms of the number of characters in
                the input string) of the extracted character

        Returns:
            the extracted character, either as a dchar or a slice into the input
            string (depending on the pull_dchars template parameter).

    ***************************************************************************/

    static if ( pull_dchars )
    {
        public static OutType extract ( Char[] text, out size_t width )
        {
            if ( !text.length )
            {
                // width is an out parameter and is thus 0 at this point
                return InvalidUnicode;
            }

            static if ( is(Unqual!(Char) == dchar) )
            {
                // utf32: every element is a complete unicode character
                width = 1;
                return text[0];
            }
            else
            {
                return Utf.decode(text, width);
            }
        }
    }
    else
    {
        public static OutType extract ( Char[] text, out size_t width )
        {
            if ( !text.length )
            {
                // Return an empty slice of the input rather than a string
                // literal: a literal is immutable and cannot be returned as
                // Char[] when Char is mutable. width is an out parameter and
                // is thus 0 at this point.
                return text[0 .. 0];
            }

            static if ( is(Unqual!(Char) == dchar) )
            {
                // utf32: every element is a complete unicode character
                width = 1;
            }
            else
            {
                // Only the width is needed here, the decoded character itself
                // is discarded.
                Utf.decode(text, width);
            }

            return text[0..width];
        }
    }
}
unittest
{
    // The same text in two different encodings must be reported as matching.
    istring utf8_text = "hello world ®";
    const(dchar)[] utf32_text = "hello world ®";

    test(utf_match(utf8_text, utf32_text));
}