1 /****************************************************************************** 2 3 String splitting utilities 4 5 - The SplitStr class splits a string by occurrences of a delimiter string. 6 - The SplitChr class splits a string by occurrences of a delimiter 7 character. 8 9 Copyright: 10 Copyright (c) 2009-2016 dunnhumby Germany GmbH. 11 All rights reserved. 12 13 License: 14 Boost Software License Version 1.0. See LICENSE_BOOST.txt for details. 15 Alternatively, this file may be distributed under the terms of the Tango 16 3-Clause BSD License (see LICENSE_BSD.txt for details). 17 18 ******************************************************************************/ 19 20 module ocean.text.util.SplitIterator; 21 22 23 import ocean.transition; 24 25 import ocean.core.Array: concat, copy; 26 import ocean.core.Verify; 27 28 import ocean.stdc.string: strlen, memchr, strcspn; 29 import core.stdc.ctype: isspace; 30 31 import ocean.stdc.posix.sys.types: ssize_t; 32 33 import ocean.io.Stdout; 34 import ocean.text.Search: SearchFruct, search; 35 36 version(UnitTest) import ocean.core.Test; 37 38 /****************************************************************************** 39 40 Splits a string by occurrences of a delimiter string. 41 42 Memory friendly, suitable for stack-allocated scope instances. 43 44 ******************************************************************************/ 45 46 class StrSplitIterator : ISplitIterator 47 { 48 alias typeof(.search(cstring.init)) Search; 49 50 /************************************************************************** 51 52 Contains the delimiter as match string and manages a table of indices to 53 improve the search algorithm efficiency. May be modified at any time 54 using its methods. 55 56 **************************************************************************/ 57 58 public Search sf; 59 60 /************************************************************************** 61 62 Constructor 63 64 Params: 65 delim_ = delimiter string 66 67 **************************************************************************/ 68 69 public this ( cstring delim_ ) 70 { 71 this.sf = .search(delim_); 72 } 73 74 /************************************************************************** 75 76 Constructor 77 78 Params: 79 delim_ = delimiter string 80 content = Content string to split. content will be sliced (not copied). 81 82 **************************************************************************/ 83 84 public this ( cstring delim_, cstring content ) 85 { 86 this(delim_); 87 this.reset(content); 88 } 89 90 /************************************************************************** 91 92 Constructor 93 94 Intended to be used for a 'scope' instance where a SearchFruct instance 95 is stored somewhere in order to reuse the search index. 96 97 Params: 98 sf_in = delimiter string 99 100 **************************************************************************/ 101 102 public this ( Search sf_in ) 103 { 104 this.sf = sf_in; 105 } 106 107 /************************************************************************** 108 109 Sets the delimiter string. delim_ may or may not be NUL-terminated; 110 however, only the last character may be NUL. 111 112 Params: 113 delim_ = new delimiter string (will be copied into an internal 114 buffer) 115 116 Returns: 117 delim_ 118 119 **************************************************************************/ 120 121 public cstring delim ( cstring delim_ ) 122 { 123 this.sf.match = delim_; 124 125 return delim_; 126 } 127 128 /************************************************************************** 129 130 Returns: 131 current delimiter string (without NUL-terminator; slices an internal 132 buffer) 133 134 **************************************************************************/ 135 136 public cstring delim ( ) 137 { 138 return this.sf.match; 139 } 140 141 /************************************************************************** 142 143 Locates the first occurrence of the current delimiter string in str, 144 starting from str[start]. 145 146 Params: 147 str = string to scan for delimiter 148 start = search start index 149 150 Returns: 151 index of first occurrence of the current delimiter string in str or 152 str.length if not found 153 154 **************************************************************************/ 155 156 public override size_t locateDelim ( cstring str, size_t start = 0 ) 157 { 158 return this.sf.forward(str, start); 159 } 160 161 /************************************************************************** 162 163 Skips the delimiter which str starts with. 164 Note that the result is correct only if str really starts with a 165 delimiter. 166 167 Params: 168 str = string starting with delimiter 169 170 Returns: 171 index of the first character after the starting delimiter in str 172 173 **************************************************************************/ 174 175 protected override size_t skipDelim ( cstring str ) 176 { 177 verify (str.length >= this.delim.length); 178 179 return this.sf.match.length; 180 } 181 } 182 183 unittest 184 { 185 scope split = new StrSplitIterator("123"); 186 187 split.collapse = true; 188 189 foreach (str; ["123" ~ "ab" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123", 190 "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123", 191 "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg", 192 "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg", 193 194 "123" ~ "123" ~ "ab" ~ "123" ~ "123"~ "cd" ~ "123" ~ "efg", 195 "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123" ~ "123"]) 196 { 197 foreach (element; split.reset(str)) 198 { 199 istring[] elements = ["ab", "cd", "efg"]; 200 201 test (split.n); 202 test (split.n <= elements.length); 203 test (element == elements[split.n - 1]); 204 } 205 } 206 207 split.collapse = false; 208 209 foreach (element; split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg")) 210 { 211 istring[] elements = ["ab", "cd", "efg"]; 212 213 test (split.n); 214 test (split.n <= elements.length); 215 test (element == elements[split.n - 1]); 216 } 217 218 foreach (element; split.reset("123" ~ "ab"~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123")) 219 { 220 istring[] elements = ["", "ab", "cd", "efg", ""]; 221 222 test (split.n); 223 test (split.n <= elements.length); 224 test (element == elements[split.n - 1]); 225 } 226 227 split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg"); 228 229 test (split.next == "ab"); 230 test (split.next == "cd"); 231 test (split.next == "efg"); 232 233 } 234 235 unittest 236 { 237 scope split_constr = new StrSplitIterator("123", "ab" ~ "123" ~ "cd" ~ "123" ~ "efg"); 238 239 test (split_constr.next == "ab"); 240 test (split_constr.next == "cd"); 241 test (split_constr.next == "efg"); 242 } 243 244 245 /****************************************************************************** 246 247 Splits a string by occurrences of a delimiter character 248 249 ******************************************************************************/ 250 251 class ChrSplitIterator : ISplitIterator 252 { 253 /************************************************************************** 254 255 Delimiter character. Must be specified in the constructor but may be 256 changed at any time, even during iteration. 257 258 **************************************************************************/ 259 260 public char delim; 261 262 /************************************************************************** 263 264 Constructor 265 266 Params: 267 delim_ = delimiter character 268 269 **************************************************************************/ 270 271 public this ( char delim_ ) 272 { 273 this.delim = delim_; 274 } 275 276 /************************************************************************** 277 278 Constructor 279 280 Params: 281 delim_ = delimiter character 282 content = Content string to split. content will be sliced (not copied). 283 284 **************************************************************************/ 285 286 public this ( char delim_, cstring content ) 287 { 288 this(delim_); 289 this.reset(content); 290 } 291 292 /************************************************************************** 293 294 Locates the first occurrence of delim in str starting with str[start]. 295 296 Params: 297 str = string to scan 298 start = search start index, must be at most str.length 299 300 Returns: 301 index of first occurrence of delim in str or str.length if not 302 found 303 304 **************************************************************************/ 305 306 public override size_t locateDelim ( cstring str, size_t start = 0 ) 307 { 308 verify( 309 start <= str.length, 310 typeof (this).stringof ~ ".locateDelim: start index out of range" 311 ); 312 313 char* item = cast (char*) memchr(str.ptr + start, this.delim, str.length - start); 314 315 return item? item - str.ptr : str.length; 316 } 317 318 /************************************************************************** 319 320 Skips the delimiter which str starts with. 321 Note that the result is correct only if str really starts with a 322 delimiter. 323 324 Params: 325 str = string starting with delimiter 326 327 Returns: 328 index of the first character after the starting delimiter in str 329 330 **************************************************************************/ 331 332 protected override size_t skipDelim ( cstring str ) 333 { 334 verify(str.length >= 1); 335 336 return 1; 337 } 338 } 339 340 unittest 341 { 342 scope split_constr = new ChrSplitIterator('1', "ab" ~ "1" ~ "cd" ~ "1" ~ "efg"); 343 344 test (split_constr.next == "ab"); 345 test (split_constr.next == "cd"); 346 test (split_constr.next == "efg"); 347 } 348 349 350 /****************************************************************************** 351 352 Base class 353 354 ******************************************************************************/ 355 356 abstract class ISplitIterator 357 { 358 /************************************************************************** 359 360 Set to true to collapse consecutive delimiter occurrences to a single 361 one to prevent producing empty segments. 362 363 **************************************************************************/ 364 365 public bool collapse = false; 366 367 /************************************************************************** 368 369 Set to true to do a 'foreach' cycle with the remaining content after 370 the last delimiter occurrence or when no delimiter is found. 371 372 **************************************************************************/ 373 374 public bool include_remaining = true; 375 376 /************************************************************************** 377 378 String to split on next iteration and slice to remaining content. 379 380 **************************************************************************/ 381 382 private cstring content, remaining_; 383 384 /************************************************************************** 385 386 'foreach' iteration counter 387 388 **************************************************************************/ 389 390 private uint n_ = 0; 391 392 /************************************************************************** 393 394 Union of the supported 'foreach' iteration delegate types 395 396 **************************************************************************/ 397 398 protected union IterationDelegate 399 { 400 int delegate ( ref size_t pos, ref cstring segment ) with_pos; 401 402 int delegate ( ref cstring segment ) without_pos; 403 } 404 405 /************************************************************************** 406 407 Consistency check 408 409 **************************************************************************/ 410 411 invariant ( ) 412 { 413 if (this.n_) 414 { 415 assert (this.content); 416 } 417 418 /* 419 * TODO: Is this what 420 * --- 421 * assert (this.content[$ - this.remaining_.length .. $] is this.remaining_); 422 * --- 423 * does, that is, comparing the memory location for identity, not the 424 * content? If so, replace it. 425 */ 426 427 assert (this.remaining_.length <= this.content.length); 428 429 if (this.remaining_.length) 430 { 431 assert (this.remaining_.ptr is &this.content[$ - this.remaining_.length]); 432 } 433 } 434 435 /************************************************************************** 436 437 Sets the content string to split on next iteration. 438 439 Params: 440 content = Content string to split; pass null to clear the content. 441 content will be sliced (not copied). 442 443 Returns: 444 this instance 445 446 **************************************************************************/ 447 448 public typeof (this) reset ( cstring content = null ) 449 { 450 this.content = content; 451 this.remaining_ = this.content; 452 this.n_ = 0; 453 454 return this; 455 } 456 457 /************************************************************************** 458 459 'foreach' iteration over string slices between the current and the next 460 delimiter. n() returns the number of 'foreach' loop cycles so far, 461 remaining() the slice after the next delimiter to the content end. 462 If no delimiter was found, n() is 0 after 'foreach' has finished and 463 remaining() returns the content. 464 465 segment slices content so do not modify it. However, the content of 466 segment may be modified which will result in an in-place modification 467 of the content. 468 469 **************************************************************************/ 470 471 public int opApply ( scope int delegate ( ref cstring segment ) dg_in ) 472 { 473 IterationDelegate dg; 474 475 dg.without_pos = dg_in; 476 477 return this.opApply_(false, dg); 478 } 479 480 /************************************************************************** 481 482 'foreach' iteration over string slices between the current and the next 483 delimiter. n() returns the number of 'foreach' loop cycles so far, 484 remaining() the slice after the next delimiter to the content end. 485 If no delimiter was found, n() is 0 after 'foreach' has finished and 486 remaining() returns the content. 487 488 pos references the current content position and may be changed to 489 specify the position where searching should be continued. If changed, 490 pos must be at most content.length. 491 492 segment slices content so do not modify it. However, the content of 493 segment may be modified which will result in an in-place modification 494 of the content. 495 496 **************************************************************************/ 497 498 public int opApply ( scope int delegate ( ref size_t pos, ref cstring segment ) dg_in ) 499 { 500 IterationDelegate dg; 501 502 dg.with_pos = dg_in; 503 504 return this.opApply_(true, dg); 505 } 506 507 /************************************************************************** 508 509 Returns: 510 the number of 'foreach' loop cycles so far. If the value is 0, 511 either no 'foreach' iteration has been done since last reset() or 512 there is no delimiter occurrence in the content string. remaining() 513 will then return the content string. 514 515 **************************************************************************/ 516 517 public uint n ( ) 518 { 519 return this.n_; 520 } 521 522 /************************************************************************** 523 524 Returns: 525 - a slice to the content string after the next delimiter when 526 currently doing a 'foreach' iteration, 527 - a slice to the content string after the last delimiter after a 528 'foreach' iteration has finished, 529 - the content string if no 'foreach' iteration has been done or 530 there is no delimiter occurrence in the content string. 531 532 **************************************************************************/ 533 534 public cstring remaining ( ) 535 { 536 return this.remaining_; 537 } 538 539 /************************************************************************** 540 541 Locates the first delimiter occurrence in str starting from str[start]. 542 543 544 545 Params: 546 str = str to locate first delimiter occurrence in 547 start = start index 548 549 Returns: 550 index of the first delimiter occurrence in str or str.length if not 551 found 552 553 **************************************************************************/ 554 555 abstract size_t locateDelim ( cstring str, size_t start = 0 ); 556 557 /************************************************************************** 558 559 Locates the first delimiter occurrence in the current content string 560 starting from content[start]. 561 562 Params: 563 start = start index, must be at most content.length 564 565 Returns: 566 index of the first delimiter occurrence in str or str.length 567 either not found or start >= content.length 568 569 **************************************************************************/ 570 571 public size_t locateDelim ( size_t start = 0 ) 572 { 573 verify (start <= this.content.length, 574 typeof (this).stringof ~ ".locateDelim(): start index out of range"); 575 return this.locateDelim(this.content, start); 576 } 577 578 /************************************************************************** 579 580 Skips initial consecutive occurrences of the current delimiter in the 581 currently remaining content. 582 583 Returns: 584 remaining content after the delimiters have been skipped. 585 586 **************************************************************************/ 587 588 public cstring skipLeadingDelims ( ) 589 { 590 size_t start = 0, 591 pos = this.locateDelim(this.remaining_); 592 593 while (pos == start && pos < this.remaining_.length) 594 { 595 start = pos + this.skipDelim(this.remaining_[pos .. $]); 596 597 pos = this.locateDelim(this.remaining_, start); 598 } 599 600 return this.remaining_ = this.remaining_[start .. $]; 601 } 602 603 /************************************************************************** 604 605 Searches the next delimiter. 606 607 Returns: 608 a slice to the content between the previous and next delimiter, if 609 found. If not found and include_remaining is true, the remaining 610 content is returned or null if include_remaining is false. 611 612 **************************************************************************/ 613 614 public cstring next ( ) 615 { 616 if (this.remaining_.length) 617 { 618 this.n_++; 619 620 if (this.collapse) 621 { 622 this.skipLeadingDelims(); 623 } 624 625 size_t start = this.content.length - this.remaining_.length, 626 end = this.locateDelim(start); 627 628 if (end < this.content.length) 629 { 630 this.remaining_ = this.content[end + this.skipDelim(this.content[end .. $]) .. $]; 631 632 return this.content[start .. end]; 633 } 634 else if (this.include_remaining) 635 { 636 scope (success) this.remaining_ = null; 637 638 return this.remaining_; 639 } 640 else 641 { 642 return null; 643 } 644 } 645 else 646 { 647 return null; 648 } 649 } 650 651 /************************************************************************** 652 653 'foreach' iteration over string slices between the current and the next 654 delimiter. 655 656 Params: 657 with_pos = true: use dg.with_pos, false: user dg.without_pos 658 dg = iteration delegate 659 660 Returns: 661 passes through dg() return value. 662 663 **************************************************************************/ 664 665 protected int opApply_ ( bool with_pos, IterationDelegate dg ) 666 { 667 int result = 0; 668 669 if (this.remaining_.length) 670 { 671 if (this.collapse) 672 { 673 this.skipLeadingDelims(); 674 } 675 676 size_t start = this.content.length - this.remaining_.length; 677 678 for (size_t pos = this.locateDelim(start); 679 pos < this.content.length; 680 pos = this.locateDelim(start)) 681 { 682 size_t next = pos + this.skipDelim(this.content[pos .. $]); 683 684 if (!(pos == start && collapse)) 685 { 686 this.n_++; 687 688 cstring segment = this.content[start .. pos]; 689 this.remaining_ = this.content[next .. $]; 690 691 if (with_pos) 692 { 693 result = dg.with_pos(next, segment); 694 695 verify (next <= this.content.length, 696 typeof (this).stringof ~ ": iteration delegate " 697 ~ "set the position out of range"); 698 699 this.remaining_ = this.content[next .. $]; 700 } 701 else 702 { 703 result = dg.without_pos(segment); 704 } 705 } 706 707 start = next; 708 709 if (result || start >= this.content.length) break; 710 } 711 712 this.remaining_ = this.content[start .. $]; 713 714 if (this.include_remaining && 715 !(result || (!this.remaining_.length && this.collapse))) 716 { 717 this.n_++; 718 719 cstring segment = this.remaining_; 720 721 this.remaining_ = ""; 722 723 result = with_pos? dg.with_pos(start, segment) : 724 dg.without_pos(segment); 725 } 726 } 727 728 return result; 729 } 730 731 /************************************************************************** 732 733 Skips the delimiter which str starts with. 734 The return value is at most str.length. 735 It is assured that str starts with a delimiter so a subclass may return 736 an undefined result otherwise. Additionally, a subclass is encouraged to 737 use an 'in' contract to ensure str starts with a delimiter and/or is 738 long enought to skip a leading delimiter. 739 740 Params: 741 str = string starting with delimiter 742 743 Returns: 744 index of the first character after the starting delimiter in str 745 746 **************************************************************************/ 747 748 abstract protected size_t skipDelim ( cstring str ); 749 750 /*************************************************************************** 751 752 Trims white space from str. 753 754 Params: 755 str = input string 756 757 Returns: 758 the resulting string 759 760 ***************************************************************************/ 761 762 static cstring trim ( cstring str ) 763 { 764 foreach_reverse (i, c; str) 765 { 766 if (!isspace(c)) 767 { 768 str = str[0 .. i + 1]; 769 break; 770 } 771 } 772 773 foreach (i, c; str) 774 { 775 if (!isspace(c)) 776 { 777 return str[i .. $]; 778 } 779 } 780 781 return str? str[0 .. 0] : null; 782 } 783 784 }