/*******************************************************************************

    Struct template to iterate over strings in variable encoding format (utf8,
    utf16, utf32), extracting one unicode character at a time. Each unicode
    character may be represented by one or more characters in the input string,
    depending on the encoding format.

    The struct takes a template parameter (pull_dchars) which determines
    whether its methods return unicode characters (utf32 - dchars) or characters
    in the same format as the source string.

    The template also has an index operator, to extract the nth unicode
    character in the string, and methods and static methods for extracting
    single characters from a string of variable encoding.

    Example usage:

    ---

        import ocean.text.utf.UtfString;

        char[] test = "test string";
        UtfString!(char) utfstr = { test };

        foreach ( width, i, c; utfstr )
        {
            Stdout.formatln("Character {} is {} and it's {} wide", i, c, width);
        }

    ---

    There is also a utf_match function in the module, which compares two strings
    for equivalence, irrespective of whether they're in the same encoding or
    not.

    Example:

    ---

        import ocean.text.utf.UtfString;

        char[] str1 = "hello world ®"; // utf8 encoding
        dchar[] str2 = "hello world ®"; // utf32 encoding

        assert(utf_match(str1, str2));

    ---

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).
*******************************************************************************/

module ocean.text.utf.UtfString;




import Utf = ocean.text.convert.Utf;

import ocean.meta.types.Qualifiers;

import ocean.core.Verify;

version (unittest) import ocean.core.Test;

/*******************************************************************************

    Invalid unicode.

*******************************************************************************/

public static immutable dchar InvalidUnicode = cast(dchar)0xffffffff;



/*******************************************************************************

    Encoding agnostic string compare function.

    When both strings have the same character type they are compared directly.
    Otherwise both are decoded one unicode character at a time and compared
    character by character; the strings match only if every character is equal
    and both strings are exhausted at the same time.

    Params:
        Char1 = character type of first string to compare
        Char2 = character type of second string to compare
        str1 = first string to compare
        str2 = second string to compare

    Returns:
        true if the strings contain the same unicode characters

*******************************************************************************/

bool utf_match ( Char1, Char2 ) ( Char1[] str1, Char2[] str2 )
{
    static if ( is(Char1 == Char2) )
    {
        return str1 == str2;
    }
    else
    {
        // Fast path: exactly one of the two strings is empty
        if ( (str1.length == 0 || str2.length == 0) && str1.length != str2.length )
        {
            return false;
        }
        UtfString!(Char1, true) utf_str1 = { str1 };
        UtfString!(Char2, true) utf_str2 = { str2 };

        foreach ( c1; utf_str1 )
        {
            // Consumes one character from str2 per character of str1. If str2
            // runs out first, extract() yields InvalidUnicode, which cannot
            // equal a decoded character of str1.
            auto c2 = utf_str2.extract(true);

            if ( c1 != c2 )
            {
                return false;
            }
        }

        // str1 is exhausted: the strings only match if str2 has been fully
        // consumed as well (otherwise str1 is merely a prefix of str2).
        return utf_str2.string.length == 0;
    }
}



/*******************************************************************************

    UtfString template struct

    Params:
        Char = type of strings to process
        pull_dchars = determines the output type of the struct's methods.
If 136 true they will all output dchars (ie unicode / utf32 characters), 137 otherwise they output slices of the input string, containing the 138 characters representing a single unicode character. 139 140 *******************************************************************************/ 141 142 public struct UtfString ( Char = char, bool pull_dchars = false ) 143 { 144 /*************************************************************************** 145 146 Check the parameter type of this class. 147 148 ***************************************************************************/ 149 150 static assert( 151 is(Unqual!(Char) == char) 152 || is(Unqual!(Char) == wchar) 153 || is(Unqual!(Char) == dchar), 154 This.stringof ~ " template parameter Char must be one of {char, wchar, dchar}, not " ~ Char.stringof 155 ); 156 157 /*************************************************************************** 158 159 This alias. 160 161 ***************************************************************************/ 162 163 public alias typeof(&this) This; 164 165 166 /*************************************************************************** 167 168 String to iterate over. 169 170 ***************************************************************************/ 171 172 public Char[] string; 173 174 175 /*************************************************************************** 176 177 Output type alias. 178 179 ***************************************************************************/ 180 181 static if ( pull_dchars ) 182 { 183 public alias dchar OutType; 184 public alias dchar[] ArrayOutType; 185 } 186 else 187 { 188 public alias Char[] OutType; 189 public alias Char[] ArrayOutType; 190 } 191 192 193 /*************************************************************************** 194 195 Internal buffer, used by the slice operator. 
196 197 ***************************************************************************/ 198 199 private ArrayOutType slice_string; 200 201 202 /*************************************************************************** 203 204 foreach iterator. 205 206 Exposes the following foreach parameters: 207 size_t width = number of input characters for this unicode character 208 size_t i = current index into the input string 209 OutType c = the next unicode character in the string 210 211 ***************************************************************************/ 212 213 public int opApply ( scope int delegate ( ref size_t, ref size_t, ref OutType ) dg ) 214 { 215 int res; 216 size_t i; 217 218 while ( i < this.string.length ) 219 { 220 Char[] process = this.string[i..$]; 221 222 size_t width; 223 auto c = This.extract(process, width); 224 225 res = dg(width, i, c); 226 if ( res ) 227 { 228 break; 229 } 230 231 i += width; 232 } 233 234 return res; 235 } 236 237 238 /*************************************************************************** 239 240 foreach iterator. 241 242 Exposes the following foreach parameters: 243 size_t i = current index into the input string 244 OutType c = the next unicode character in the string 245 246 ***************************************************************************/ 247 248 public int opApply ( scope int delegate ( ref size_t, ref OutType ) dg ) 249 { 250 int res; 251 size_t i; 252 253 while ( i < this.string.length ) 254 { 255 Char[] process = this.string[i..$]; 256 257 size_t width; 258 auto c = This.extract(process, width); 259 260 res = dg(i, c); 261 if ( res ) 262 { 263 break; 264 } 265 266 i += width; 267 } 268 269 return res; 270 } 271 272 273 /*************************************************************************** 274 275 foreach iterator. 
276 277 Exposes the following foreach parameters: 278 OutType c = the next unicode character in the string 279 280 ***************************************************************************/ 281 282 public int opApply ( scope int delegate ( ref OutType ) dg ) 283 { 284 int res; 285 size_t i; 286 287 while ( i < this.string.length ) 288 { 289 Char[] process = this.string[i..$]; 290 291 size_t width; 292 auto c = This.extract(process, width); 293 294 res = dg(c); 295 if ( res ) 296 { 297 break; 298 } 299 300 i += width; 301 } 302 303 return res; 304 } 305 306 307 /*************************************************************************** 308 309 opIndex. Extracts the nth unicode character from the referenced string. 310 311 Params: 312 index = index of character to extract 313 314 Returns: 315 the extracted character, either as a dchar or a slice into the input 316 string (depending on the pull_dchars template parameter). 317 318 ***************************************************************************/ 319 320 public OutType opIndex ( size_t index ) 321 { 322 verify(this.string.length > 0, 323 This.stringof ~ ".opIndex - attempted to index into an empty string"); 324 325 size_t i; 326 size_t count; 327 OutType c; 328 do 329 { 330 size_t width; 331 c = This.extract(this.string[i..$], width); 332 i += width; 333 } while ( count++ < index ); 334 335 return c; 336 } 337 338 339 /*************************************************************************** 340 341 opSlice. Extracts an indexed sequence of unicode characters from the 342 referenced string. 343 344 For dchar output, the returned slice is built up in the internal 345 slice_string member. Otherwise a slice into the referenced string is 346 returned. 347 348 Params: 349 start = index of first character to extract 350 end = index of last character to extract 351 352 Returns: 353 the sliced characters (either as dchars or as the same type as the 354 referenced string). 
355 356 ***************************************************************************/ 357 358 public ArrayOutType opSlice ( size_t start, size_t end ) 359 { 360 verify(end > start, typeof(this).stringof ~ ".opSlice - end <= start!"); 361 362 static if ( pull_dchars ) 363 { 364 return this.sliceCopy(start, end, this.slice_string); 365 } 366 else 367 { 368 size_t start_i; 369 size_t char_count; 370 size_t src_i; 371 372 while ( src_i < this.string.length ) 373 { 374 if ( char_count == start ) 375 { 376 start_i = src_i; 377 } 378 if ( char_count >= end ) 379 { 380 return this.string[start_i .. src_i]; 381 } 382 383 Char[] process = this.string[src_i..$]; 384 385 size_t width; 386 This.extract(process, width); 387 388 src_i += width; 389 char_count++; 390 } 391 392 assert(false, typeof(this).stringof ~ ".opSlice - end > array length"); 393 } 394 } 395 396 397 /*************************************************************************** 398 399 Slice / copy. Extracts an indexed sequence of unicode characters from 400 the referenced string and copies them into the provided buffer. 401 402 The returned slice is built up in the passed string. 403 404 Params: 405 start = index of first character to extract 406 end = index of last character to extract 407 output = string into which the sliced characters are placed 408 409 Returns: 410 the sliced characters (either as dchars or as the same type as the 411 referenced string). 412 413 ***************************************************************************/ 414 415 public ArrayOutType sliceCopy ( size_t start, size_t end, ref ArrayOutType output ) 416 { 417 output.length = 0; 418 419 size_t i; 420 foreach ( c; this ) 421 { 422 if ( i >= start ) 423 { 424 output ~= c; 425 } 426 427 if ( ++i >= end ) 428 { 429 break; 430 } 431 } 432 433 return output; 434 } 435 436 437 /*************************************************************************** 438 439 Calculates the number of unicode characters in the referenced string. 
440 The calculation requires that the whole string is iterated over. 441 442 Returns: 443 number of unicode characters in the string 444 445 ***************************************************************************/ 446 447 public size_t length ( ) 448 { 449 size_t len; 450 451 foreach ( c; this ) 452 { 453 len++; 454 } 455 456 return len; 457 } 458 459 460 /*************************************************************************** 461 462 Extract the next character from the referenced string. 463 464 Params: 465 consume = if true, the extracted characters are removed from the 466 string (the start of the slice is advanced) 467 468 Returns: 469 the extracted character, either as a dchar or a slice into the input 470 string (depending on the pull_dchars template parameter). 471 472 ***************************************************************************/ 473 474 public OutType extract ( bool consume = false ) 475 { 476 size_t width; 477 return this.extract(width, consume); 478 } 479 480 481 /*************************************************************************** 482 483 Extract the next character from the referenced string. 484 485 Params: 486 width = outputs the width (in terms of the number of characters in 487 the input string) of the extracted character 488 consume = if true, the extracted characters are removed from the 489 string (the start of the slice is advanced) 490 491 Returns: 492 the extracted character, either as a dchar or a slice into the input 493 string (depending on the pull_dchars template parameter). 
494 495 ***************************************************************************/ 496 497 public OutType extract ( out size_t width, bool consume = false ) 498 { 499 auto extracted = This.extract(this.string, width); 500 if ( consume ) 501 { 502 this.string = this.string[width..$]; 503 } 504 505 return extracted; 506 } 507 508 509 /*************************************************************************** 510 511 Static method to extract the next character from the passed string. 512 513 Params: 514 text = string to extract from 515 516 Returns: 517 the extracted character, either as a dchar or a slice into the input 518 string (depending on the pull_dchars template parameter). 519 520 ***************************************************************************/ 521 522 public static OutType extract ( Char[] text ) 523 { 524 size_t width; 525 return This.extract(text, width); 526 } 527 528 529 /*************************************************************************** 530 531 Static method to extract the next character from the passed string. 532 533 Params: 534 text = string to extract from 535 width = outputs the width (in terms of the number of characters in 536 the input string) of the extracted character 537 538 Returns: 539 the extracted character, either as a dchar or a slice into the input 540 string (depending on the pull_dchars template parameter). 
541 542 ***************************************************************************/ 543 544 static if ( pull_dchars ) 545 { 546 public static OutType extract ( Char[] text, out size_t width ) 547 { 548 if ( !text.length ) 549 { 550 return InvalidUnicode; 551 } 552 553 static if ( is(Unqual!(Char) == dchar) ) 554 { 555 width = 1; 556 return text[0]; 557 } 558 else 559 { 560 dchar unicode = Utf.decode(text, width); 561 return unicode; 562 } 563 } 564 } 565 else 566 { 567 public static OutType extract ( Char[] text, out size_t width ) 568 { 569 if ( !text.length ) 570 { 571 return ""; 572 } 573 574 static if ( is(Unqual!(Char) == dchar) ) 575 { 576 width = 1; 577 } 578 else 579 { 580 dchar unicode = Utf.decode(text, width); 581 } 582 583 return text[0..width]; 584 } 585 } 586 } 587 588 589 unittest 590 { 591 istring str1 = "hello world ®"; // utf8 encoding 592 const(dchar)[] str2 = "hello world ®"; // utf32 encoding 593 594 test(utf_match(str1, str2)); 595 }