/******************************************************************************

    String splitting utilities

    - The StrSplitIterator class splits a string by occurrences of a delimiter
      string.
    - The ChrSplitIterator class splits a string by occurrences of a delimiter
      character.

    Both derive from ISplitIterator, which implements the iteration logic
    ('foreach' and next()) on top of two primitives provided by the subclass:
    locating a delimiter and skipping a delimiter.

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

******************************************************************************/

module ocean.text.util.SplitIterator;

import ocean.meta.types.Qualifiers;
import ocean.core.Array: concat, copy;
import ocean.core.Verify;
import ocean.io.Stdout;
import ocean.text.Search: SearchFruct, search;

import core.stdc.string: strlen, memchr, strcspn;
import core.stdc.ctype: isspace;
import core.sys.posix.sys.types: ssize_t;

version (unittest) import ocean.core.Test;

/******************************************************************************

    Splits a string by occurrences of a delimiter string.

    Memory friendly, suitable for stack-allocated scope instances.

******************************************************************************/

class StrSplitIterator : ISplitIterator
{
    alias typeof(.search(cstring.init)) Search;

    /**************************************************************************

        Contains the delimiter as match string and manages a table of indices to
        improve the search algorithm efficiency. May be modified at any time
        using its methods.

     **************************************************************************/

    public Search sf;

    /**************************************************************************

        Constructor

        Params:
            delim_ = delimiter string

     **************************************************************************/

    public this ( cstring delim_ )
    {
        this.sf = .search(delim_);
    }

    /**************************************************************************

        Constructor

        Params:
            delim_  = delimiter string
            content = Content string to split. content will be sliced (not
                      copied).

     **************************************************************************/

    public this ( cstring delim_, cstring content )
    {
        this(delim_);
        this.reset(content);
    }

    /**************************************************************************

        Constructor

        Intended to be used for a 'scope' instance where a SearchFruct instance
        is stored somewhere in order to reuse the search index.

        Params:
            sf_in = search instance whose match string is the delimiter

     **************************************************************************/

    public this ( Search sf_in )
    {
        this.sf = sf_in;
    }

    /**************************************************************************

        Sets the delimiter string. delim_ may or may not be NUL-terminated;
        however, only the last character may be NUL.

        Params:
            delim_ = new delimiter string (will be copied into an internal
                     buffer)

        Returns:
            delim_

     **************************************************************************/

    public cstring delim ( cstring delim_ )
    {
        this.sf.match = delim_;

        return delim_;
    }

    /**************************************************************************

        Returns:
            current delimiter string (without NUL-terminator; slices an internal
            buffer)

     **************************************************************************/

    public cstring delim ( )
    {
        return this.sf.match;
    }

    /**************************************************************************

        Locates the first occurrence of the current delimiter string in str,
        starting from str[start].

        Params:
            str   = string to scan for delimiter
            start = search start index

        Returns:
            index of first occurrence of the current delimiter string in str or
            str.length if not found

     **************************************************************************/

    public override size_t locateDelim ( cstring str, size_t start = 0 )
    {
        return this.sf.forward(str, start);
    }

    /**************************************************************************

        Skips the delimiter which str starts with.
        Note that the result is correct only if str really starts with a
        delimiter.

        Params:
            str = string starting with delimiter

        Returns:
            index of the first character after the starting delimiter in str

     **************************************************************************/

    protected override size_t skipDelim ( cstring str )
    {
        verify (str.length >= this.delim.length);

        return this.sf.match.length;
    }
}

unittest
{
    scope split = new StrSplitIterator("123");

    split.collapse = true;

    foreach (str; ["123" ~ "ab" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123",
                   "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123",
                   "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg",
                   "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg",

                   "123" ~ "123" ~ "ab" ~ "123" ~ "123"~ "cd" ~ "123" ~ "efg",
                   "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123" ~ "123"])
    {
        foreach (element; split.reset(str))
        {
            istring[] elements = ["ab", "cd", "efg"];

            test (split.n);
            test (split.n <= elements.length);
            test (element == elements[split.n - 1]);
        }
    }

    split.collapse = false;

    foreach (element; split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg"))
    {
        istring[] elements = ["ab", "cd", "efg"];

        test (split.n);
        test (split.n <= elements.length);
        test (element == elements[split.n - 1]);
    }

    foreach (element; split.reset("123" ~ "ab"~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123"))
    {
        istring[] elements = ["", "ab", "cd", "efg", ""];

        test (split.n);
        test (split.n <= elements.length);
        test (element == elements[split.n - 1]);
    }

    split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg");

    test (split.next == "ab");
    test (split.next == "cd");
    test (split.next == "efg");

}

unittest
{
    scope split_constr = new StrSplitIterator("123", "ab" ~ "123" ~ "cd" ~ "123" ~ "efg");

    test (split_constr.next == "ab");
    test (split_constr.next == "cd");
    test (split_constr.next == "efg");
}


/******************************************************************************

    Splits a string by occurrences of a delimiter character

******************************************************************************/

class ChrSplitIterator : ISplitIterator
{
    /**************************************************************************

        Delimiter character. Must be specified in the constructor but may be
        changed at any time, even during iteration.

     **************************************************************************/

    public char delim;

    /**************************************************************************

        Constructor

        Params:
            delim_ = delimiter character

     **************************************************************************/

    public this ( char delim_ )
    {
        this.delim = delim_;
    }

    /**************************************************************************

        Constructor

        Params:
            delim_  = delimiter character
            content = Content string to split. content will be sliced (not
                      copied).

     **************************************************************************/

    public this ( char delim_, cstring content )
    {
        this(delim_);
        this.reset(content);
    }

    /**************************************************************************

        Locates the first occurrence of delim in str starting with str[start].

        Params:
            str   = string to scan
            start = search start index, must be at most str.length

        Returns:
            index of first occurrence of delim in str or str.length if not
            found

     **************************************************************************/

    public override size_t locateDelim ( cstring str, size_t start = 0 )
    {
        verify(
            start <= str.length,
            typeof (this).stringof ~ ".locateDelim: start index out of range"
        );

        // str is const, so keep the memchr() result const as well instead of
        // casting the qualifier away.
        const(char)* item = cast (const(char)*) memchr(str.ptr + start, this.delim, str.length - start);

        return item? item - str.ptr : str.length;
    }

    /**************************************************************************

        Skips the delimiter which str starts with.
        Note that the result is correct only if str really starts with a
        delimiter.

        Params:
            str = string starting with delimiter

        Returns:
            index of the first character after the starting delimiter in str

     **************************************************************************/

    protected override size_t skipDelim ( cstring str )
    {
        verify(str.length >= 1);

        return 1;
    }
}

unittest
{
    scope split_constr = new ChrSplitIterator('1', "ab" ~ "1" ~ "cd" ~ "1" ~ "efg");

    test (split_constr.next == "ab");
    test (split_constr.next == "cd");
    test (split_constr.next == "efg");
}

// next() must agree with 'foreach': collapsing trailing delimiters must not
// produce (or count) an empty segment.
unittest
{
    scope split = new ChrSplitIterator(',');

    split.collapse = true;
    split.reset("a,b,,");

    test (split.next == "a");
    test (split.next == "b");
    test (split.next is null);
    test (split.n == 2);
}

// next() must not count a segment it does not return.
unittest
{
    scope split = new ChrSplitIterator(',');

    split.include_remaining = false;
    split.reset("a,b");

    test (split.next == "a");
    test (split.next is null);
    test (split.next is null);
    test (split.n == 1);
    test (split.remaining == "b");
}


/******************************************************************************

    Base class

    Implements the splitting logic. A subclass defines what a delimiter is by
    implementing locateDelim(cstring, size_t) and skipDelim().

******************************************************************************/

abstract class ISplitIterator
{
    /**************************************************************************

        Set to true to collapse consecutive delimiter occurrences to a single
        one to prevent producing empty segments.

     **************************************************************************/

    public bool collapse = false;

    /**************************************************************************

        Set to true to do a 'foreach' cycle with the remaining content after
        the last delimiter occurrence or when no delimiter is found.

     **************************************************************************/

    public bool include_remaining = true;

    /**************************************************************************

        String to split on next iteration and slice to remaining content.

     **************************************************************************/

    private cstring content, remaining_;

    /**************************************************************************

        Iteration counter: number of segments produced since the last reset().

     **************************************************************************/

    private uint n_ = 0;

    /**************************************************************************

        Union of the supported 'foreach' iteration delegate types

     **************************************************************************/

    protected union IterationDelegate
    {
        int delegate ( ref size_t pos, ref cstring segment ) with_pos;

        int delegate ( ref cstring segment ) without_pos;
    }

    /**************************************************************************

        Consistency check

     **************************************************************************/

    invariant ( )
    {
        if (this.n_)
        {
            assert (this.content);
        }

        assert (this.remaining_.length <= this.content.length);

        /*
         * remaining_ must be the tail slice of content. The location is only
         * compared while remaining_ is non-empty: once the content has been
         * consumed, remaining_ is set to null (next()) or "" (opApply_()),
         * neither of which points into content.
         */

        if (this.remaining_.length)
        {
            assert (this.remaining_.ptr is &this.content[$ - this.remaining_.length]);
        }
    }

    /**************************************************************************

        Sets the content string to split on next iteration.

        Params:
            content = Content string to split; pass null to clear the content.
                      content will be sliced (not copied).

        Returns:
            this instance

     **************************************************************************/

    public typeof (this) reset ( cstring content = null )
    {
        this.content    = content;
        this.remaining_ = this.content;
        this.n_         = 0;

        return this;
    }

    /**************************************************************************

        'foreach' iteration over string slices between the current and the next
        delimiter. n() returns the number of 'foreach' loop cycles so far,
        remaining() the slice after the next delimiter to the content end.
        If no delimiter was found and include_remaining is false, n() is 0
        after 'foreach' has finished and remaining() returns the content.

        segment slices content so do not modify it. However, the content of
        segment may be modified which will result in an in-place modification
        of the content.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring segment ) dg_in )
    {
        IterationDelegate dg;

        dg.without_pos = dg_in;

        return this.opApply_(false, dg);
    }

    /**************************************************************************

        'foreach' iteration over string slices between the current and the next
        delimiter. n() returns the number of 'foreach' loop cycles so far,
        remaining() the slice after the next delimiter to the content end.
        If no delimiter was found and include_remaining is false, n() is 0
        after 'foreach' has finished and remaining() returns the content.

        pos references the current content position and may be changed to
        specify the position where searching should be continued. If changed,
        pos must be at most content.length.

        segment slices content so do not modify it. However, the content of
        segment may be modified which will result in an in-place modification
        of the content.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref size_t pos, ref cstring segment ) dg_in )
    {
        IterationDelegate dg;

        dg.with_pos = dg_in;

        return this.opApply_(true, dg);
    }

    /**************************************************************************

        Returns:
            the number of segments produced so far, either by 'foreach' loop
            cycles or by next() calls. If the value is 0, no segment has been
            produced since the last reset(); remaining() will then return the
            content string.

     **************************************************************************/

    public uint n ( )
    {
        return this.n_;
    }

    /**************************************************************************

        Returns:
            - a slice to the content string after the next delimiter when
              currently doing a 'foreach' iteration,
            - a slice to the content string after the last delimiter after a
              'foreach' iteration has finished,
            - the content string if no 'foreach' iteration has been done or
              there is no delimiter occurrence in the content string.

     **************************************************************************/

    public cstring remaining ( )
    {
        return this.remaining_;
    }

    /**************************************************************************

        Locates the first delimiter occurrence in str starting from str[start].

        Params:
            str   = str to locate first delimiter occurrence in
            start = start index

        Returns:
            index of the first delimiter occurrence in str or str.length if not
            found

     **************************************************************************/

    abstract size_t locateDelim ( cstring str, size_t start = 0 );

    /**************************************************************************

        Locates the first delimiter occurrence in the current content string
        starting from content[start].

        Params:
            start = start index, must be at most content.length

        Returns:
            index of the first delimiter occurrence in the content string or
            content.length if not found

     **************************************************************************/

    public size_t locateDelim ( size_t start = 0 )
    {
        verify (start <= this.content.length,
                typeof (this).stringof ~ ".locateDelim(): start index out of range");
        return this.locateDelim(this.content, start);
    }

    /**************************************************************************

        Skips initial consecutive occurrences of the current delimiter in the
        currently remaining content.

        Returns:
            remaining content after the delimiters have been skipped.

     **************************************************************************/

    public cstring skipLeadingDelims ( )
    {
        size_t start = 0,
               pos   = this.locateDelim(this.remaining_);

        // As long as a delimiter sits exactly at the current start, step over
        // it and look for the next one.
        while (pos == start && pos < this.remaining_.length)
        {
            start = pos + this.skipDelim(this.remaining_[pos .. $]);

            pos = this.locateDelim(this.remaining_, start);
        }

        return this.remaining_ = this.remaining_[start .. $];
    }

    /**************************************************************************

        Searches the next delimiter.

        n() is incremented only if a segment is returned, so that it counts the
        produced segments in the same way as a 'foreach' iteration does.

        Returns:
            a slice to the content between the previous and next delimiter, if
            found. If not found and include_remaining is true, the remaining
            content is returned or null if include_remaining is false. null is
            also returned if there is no remaining content, including the case
            where collapse is true and only delimiters were left.

     **************************************************************************/

    public cstring next ( )
    {
        if (!this.remaining_.length)
        {
            return null;
        }

        if (this.collapse)
        {
            this.skipLeadingDelims();

            // Only delimiters were left: collapsing must not produce an empty
            // segment (same as in opApply_()).
            if (!this.remaining_.length)
            {
                return null;
            }
        }

        size_t start = this.content.length - this.remaining_.length,
               end   = this.locateDelim(start);

        if (end < this.content.length)
        {
            this.n_++;

            this.remaining_ = this.content[end + this.skipDelim(this.content[end .. $]) .. $];

            return this.content[start .. end];
        }
        else if (this.include_remaining)
        {
            this.n_++;

            cstring segment = this.remaining_;

            this.remaining_ = null;

            return segment;
        }
        else
        {
            // No further delimiter and the remainder is not wanted: leave
            // remaining_ untouched and do not count a segment.
            return null;
        }
    }

    /**************************************************************************

        'foreach' iteration over string slices between the current and the next
        delimiter.

        Params:
            with_pos = true: use dg.with_pos, false: use dg.without_pos
            dg       = iteration delegate

        Returns:
            passes through dg() return value.

     **************************************************************************/

    protected int opApply_ ( bool with_pos, IterationDelegate dg )
    {
        int result = 0;

        if (this.remaining_.length)
        {
            if (this.collapse)
            {
                this.skipLeadingDelims();
            }

            size_t start = this.content.length - this.remaining_.length;

            for (size_t pos = this.locateDelim(start);
                        pos < this.content.length;
                        pos = this.locateDelim(start))
            {
                size_t next = pos + this.skipDelim(this.content[pos .. $]);

                // An empty segment between two adjacent delimiters is
                // suppressed when collapsing.
                if (!(pos == start && collapse))
                {
                    this.n_++;

                    cstring segment  = this.content[start .. pos];
                    this.remaining_ = this.content[next .. $];

                    if (with_pos)
                    {
                        result = dg.with_pos(next, segment);

                        // The delegate may have moved the position.
                        verify (next <= this.content.length,
                                typeof (this).stringof ~ ": iteration delegate "
                                ~ "set the position out of range");

                        this.remaining_ = this.content[next .. $];
                    }
                    else
                    {
                        result = dg.without_pos(segment);
                    }
                }

                start = next;

                if (result || start >= this.content.length) break;
            }

            this.remaining_ = this.content[start .. $];

            // Final cycle with the content after the last delimiter, unless
            // the loop was aborted or collapsing left nothing over.
            if (this.include_remaining &&
                !(result || (!this.remaining_.length && this.collapse)))
            {
                this.n_++;

                cstring segment = this.remaining_;

                this.remaining_ = "";

                result = with_pos? dg.with_pos(start, segment) :
                                   dg.without_pos(segment);
            }
        }

        return result;
    }

    /**************************************************************************

        Skips the delimiter which str starts with.
        The return value is at most str.length.
        It is assured that str starts with a delimiter so a subclass may return
        an undefined result otherwise. Additionally, a subclass is encouraged to
        use an 'in' contract to ensure str starts with a delimiter and/or is
        long enough to skip a leading delimiter.

        Params:
            str = string starting with delimiter

        Returns:
            index of the first character after the starting delimiter in str

     **************************************************************************/

    abstract protected size_t skipDelim ( cstring str );

    /***************************************************************************

        Trims white space from both ends of str.

        Params:
            str = input string

        Returns:
            the slice of str without leading and trailing white space; an empty
            slice of str if str consists of white space only, or null if str is
            null

    ***************************************************************************/

    static cstring trim ( cstring str )
    {
        // Cut off trailing white space first ...
        foreach_reverse (i, c; str)
        {
            if (!isspace(c))
            {
                str = str[0 .. i + 1];
                break;
            }
        }

        // ... then leading white space.
        foreach (i, c; str)
        {
            if (!isspace(c))
            {
                return str[i .. $];
            }
        }

        return str? str[0 .. 0] : null;
    }

}