1 /******************************************************************************* 2 3 Placeholder for a variety of wee functions. 4 5 Several of these functions return an index value, representing where 6 some criteria was identified. When said criteria is not matched, the 7 functions return a value representing the array length provided to 8 them. That is, for those scenarios where C functions might typically 9 return -1 these functions return length instead. This operate nicely 10 with D slices: 11 --- 12 auto text = "happy:faces"; 13 14 assert (text[0 .. locate (text, ':')] == "happy"); 15 16 assert (text[0 .. locate (text, '!')] == "happy:faces"); 17 --- 18 19 The contains() function is more convenient for trivial 20 lookup cases: 21 --- 22 if (contains ("fubar", '!')) 23 ... 24 --- 25 26 Note that where some functions expect a size_t as an argument, the 27 D template-matching algorithm will fail where an int is provided 28 instead. This is the typically the cause of "template not found" 29 errors. Also note that name overloading is not supported cleanly 30 by IFTI at this time, so is not applied here. 31 32 33 Applying the D "import alias" mechanism to this module is highly 34 recommended, in order to limit namespace pollution: 35 --- 36 import Util = ocean.text.Util; 37 38 auto s = Util.trim (" foo "); 39 --- 40 41 42 Function templates: 43 --- 44 trim (source) // trim whitespace 45 triml (source) // trim whitespace 46 trimr (source) // trim whitespace 47 strip (source, match) // trim elements 48 stripl (source, match) // trim elements 49 stripr (source, match) // trim elements 50 chopl (source, match) // trim pattern match 51 chopr (source, match) // trim pattern match 52 delimit (src, set) // split on delims 53 split (source, pattern) // split on pattern 54 splitLines (source); // split on lines 55 head (source, pattern, tail) // split to head & tail 56 join (source, postfix, output) // join text segments 57 prefix (dst, prefix, content...) // prefix text segments 58 postfix (dst, postfix, content...) // postfix text segments 59 combine (dst, prefix, postfix, content...) // combine lotsa stuff 60 repeat (source, count, output) // repeat source 61 replace (source, match, replacement) // replace chars 62 substitute (source, match, replacement) // replace/remove matches 63 count (source, match) // count instances 64 contains (source, match) // has char? 65 containsPattern (source, match) // has pattern? 66 index (source, match, start) // find match index 67 locate (source, match, start) // find char 68 locatePrior (source, match, start) // find prior char 69 locatePattern (source, match, start); // find pattern 70 locatePatternPrior (source, match, start); // find prior pattern 71 indexOf (s*, match, length) // low-level lookup 72 mismatch (s1*, s2*, length) // low-level compare 73 matching (s1*, s2*, length) // low-level compare 74 isSpace (match) // is whitespace? 75 unescape(source, output) // convert '\' prefixes 76 layout (destination, format ...) // featherweight printf 77 lines (str) // foreach lines 78 quotes (str, set) // foreach quotes 79 delimiters (str, set) // foreach delimiters 80 patterns (str, pattern) // foreach patterns 81 --- 82 83 Please note that any 'pattern' referred to within this module 84 refers to a pattern of characters, and not some kind of regex 85 descriptor. Use the Regex module for regex operation. 86 87 Copyright: 88 Copyright (c) 2004 Kris Bell. 89 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH. 90 All rights reserved. 91 92 License: 93 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0. 94 See LICENSE_TANGO.txt for details. 95 96 Version: 97 Apr 2004: Initial release 98 Dec 2006: South Seas version 99 100 Authors: Kris 101 102 *******************************************************************************/ 103 104 module ocean.text.Util; 105 106 //import ocean.meta.types.Qualifiers : cstring, mstring; 107 import ocean.transition; 108 import ocean.core.Verify; 109 110 version (UnitTest) import ocean.core.Test; 111 112 113 /****************************************************************************** 114 115 Trim the provided array by stripping whitespace from both 116 ends. Returns a slice of the original content 117 118 ******************************************************************************/ 119 120 inout(char)[] trim (inout(char)[] source) 121 { 122 inout(char)* head = source.ptr, 123 tail = head + source.length; 124 125 while (head < tail && isSpace(*head)) 126 ++head; 127 128 while (tail > head && isSpace(*(tail-1))) 129 --tail; 130 131 return head [0 .. tail - head]; 132 } 133 134 /****************************************************************************** 135 136 Trim the provided array by stripping whitespace from the left. 137 Returns a slice of the original content 138 139 ******************************************************************************/ 140 141 inout(char)[] triml (inout(char)[] source) 142 { 143 inout(char)* head = source.ptr, 144 tail = head + source.length; 145 146 while (head < tail && isSpace(*head)) 147 ++head; 148 149 return head [0 .. tail - head]; 150 } 151 152 /****************************************************************************** 153 154 Trim the provided array by stripping whitespace from the right. 155 Returns a slice of the original content 156 157 ******************************************************************************/ 158 159 inout(char)[] trimr (inout(char)[] source) 160 { 161 inout(char)* head = source.ptr, 162 tail = head + source.length; 163 164 while (tail > head && isSpace(*(tail-1))) 165 --tail; 166 167 return head [0 .. tail - head]; 168 } 169 170 /****************************************************************************** 171 172 Trim the given array by stripping the provided match from 173 both ends. Returns a slice of the original content 174 175 ******************************************************************************/ 176 177 inout(char)[] strip (inout(char)[] source, char match) 178 { 179 inout(char)* head = source.ptr, 180 tail = head + source.length; 181 182 while (head < tail && *head is match) 183 ++head; 184 185 while (tail > head && *(tail-1) is match) 186 --tail; 187 188 return head [0 .. tail - head]; 189 } 190 191 /****************************************************************************** 192 193 Trim the given array by stripping the provided match from 194 the left hand side. Returns a slice of the original content 195 196 ******************************************************************************/ 197 198 inout(char)[] stripl (inout(char)[] source, char match) 199 { 200 inout(char)* head = source.ptr, 201 tail = head + source.length; 202 203 while (head < tail && *head is match) 204 ++head; 205 206 return head [0 .. tail - head]; 207 } 208 209 /****************************************************************************** 210 211 Trim the given array by stripping the provided match from 212 the right hand side. Returns a slice of the original content 213 214 ******************************************************************************/ 215 216 inout(char)[] stripr (inout(char)[] source, char match) 217 { 218 inout(char)* head = source.ptr, 219 tail = head + source.length; 220 221 while (tail > head && *(tail-1) is match) 222 --tail; 223 224 return head [0 .. tail - head]; 225 } 226 227 /****************************************************************************** 228 229 Chop the given source by stripping the provided match from 230 the left hand side. Returns a slice of the original content 231 232 ******************************************************************************/ 233 234 inout(char)[] chopl (inout(char)[] source, cstring match) 235 { 236 if (match.length <= source.length) 237 if (source[0 .. match.length] == match) 238 source = source [match.length .. $]; 239 240 return source; 241 } 242 243 /****************************************************************************** 244 245 Chop the given source by stripping the provided match from 246 the right hand side. Returns a slice of the original content 247 248 ******************************************************************************/ 249 250 inout(char)[] chopr (inout(char)[] source, cstring match) 251 { 252 if (match.length <= source.length) 253 if (source[$-match.length .. $] == match) 254 source = source [0 .. $-match.length]; 255 256 return source; 257 } 258 259 /****************************************************************************** 260 261 Replace all instances of one element with another (in place) 262 263 ******************************************************************************/ 264 265 mstring replace (mstring source, char match, char replacement) 266 { 267 foreach (ref c; source) 268 if (c is match) 269 c = replacement; 270 return source; 271 } 272 273 /****************************************************************************** 274 275 Substitute all instances of match from source. Set replacement 276 to null in order to remove instead of replace 277 278 ******************************************************************************/ 279 280 mstring substitute (cstring source, cstring match, cstring replacement) 281 { 282 mstring output; 283 284 foreach (s; patterns (source, match, replacement)) 285 { 286 output ~= s; 287 } 288 return output; 289 } 290 291 /****************************************************************************** 292 293 Count all instances of match within source 294 295 ******************************************************************************/ 296 297 size_t count (cstring source, cstring match) 298 { 299 size_t c; 300 301 foreach (s; patterns (source, match)) 302 ++c; 303 verify(c > 0); 304 return c - 1; 305 } 306 307 /****************************************************************************** 308 309 Returns whether or not the provided array contains an instance 310 of the given match 311 312 ******************************************************************************/ 313 314 bool contains (cstring source, char match) 315 { 316 return indexOf (source.ptr, match, source.length) != source.length; 317 } 318 319 /****************************************************************************** 320 321 Returns whether or not the provided array contains an instance 322 of the given match 323 324 ******************************************************************************/ 325 326 bool containsPattern (cstring source, cstring match) 327 { 328 return locatePattern (source, match) != source.length; 329 } 330 331 unittest 332 { 333 mstring greeting = "Hello world".dup; 334 istring satan = "Hell"; 335 test(containsPattern(greeting, satan), "Pattern not found"); 336 } 337 338 339 /****************************************************************************** 340 341 Return the index of the next instance of 'match' starting at 342 position 'start', or source.length where there is no match. 343 344 Parameter 'start' defaults to 0 345 346 ******************************************************************************/ 347 348 size_t index (cstring source, cstring match, size_t start=0) 349 { 350 return (match.length is 1) ? locate (source, match[0], start) 351 : locatePattern (source, match, start); 352 } 353 354 unittest 355 { 356 char[] url; 357 size_t start; 358 cstring CSLASHSLASH = "://"; 359 start = index(url, CSLASHSLASH); 360 } 361 362 /****************************************************************************** 363 364 Return the index of the prior instance of 'match' starting 365 just before 'start', or source.length where there is no match. 366 367 Parameter 'start' defaults to source.length 368 369 ******************************************************************************/ 370 371 size_t rindex (cstring source, cstring match, size_t start=size_t.max) 372 { 373 return (match.length is 1) ? locatePrior (source, match[0], start) 374 : locatePatternPrior (source, match, start); 375 } 376 377 /****************************************************************************** 378 379 Return the index of the next instance of 'match' starting at 380 position 'start', or source.length where there is no match. 381 382 Parameter 'start' defaults to 0 383 384 ******************************************************************************/ 385 386 size_t locate (cstring source, char match, size_t start=0) 387 { 388 if (start > source.length) 389 start = source.length; 390 391 return indexOf (source.ptr+start, match, source.length - start) + start; 392 } 393 394 unittest 395 { 396 size_t start; 397 char[] url; 398 start = locate(url, '/', start); 399 } 400 401 /****************************************************************************** 402 403 Return the index of the prior instance of 'match' starting 404 just before 'start', or source.length where there is no match. 405 406 Parameter 'start' defaults to source.length 407 408 ******************************************************************************/ 409 410 size_t locatePrior (cstring source, char match, size_t start = size_t.max) 411 { 412 if (start > source.length) 413 start = source.length; 414 415 while (start > 0) 416 if (source[--start] is match) 417 return start; 418 return source.length; 419 } 420 421 /****************************************************************************** 422 423 Return the index of the next instance of 'match' starting at 424 position 'start', or source.length where there is no match. 425 426 Parameter 'start' defaults to 0 427 428 ******************************************************************************/ 429 430 size_t locatePattern (cstring source, cstring match, size_t start=0) 431 { 432 size_t idx; 433 const(char)* p = source.ptr + start; 434 size_t extent = source.length - start - match.length + 1; 435 436 if (match.length && extent <= source.length) 437 { 438 while (extent) 439 if ((idx = indexOf (p, match[0], extent)) is extent) 440 break; 441 else 442 { 443 if (matching (p+=idx, match.ptr, match.length)) 444 return p - source.ptr; 445 else 446 { 447 extent -= (idx+1); 448 ++p; 449 } 450 } 451 } 452 453 return source.length; 454 } 455 456 /****************************************************************************** 457 458 Return the index of the prior instance of 'match' starting 459 just before 'start', or source.length where there is no match. 460 461 Parameter 'start' defaults to source.length 462 463 ******************************************************************************/ 464 465 size_t locatePatternPrior (cstring source, cstring match, size_t start=size_t.max) 466 { 467 auto len = source.length; 468 469 if (start > len) 470 start = len; 471 472 if (match.length && match.length <= len) 473 while (start) 474 { 475 start = locatePrior (source, match[0], start); 476 if (start is len) 477 break; 478 else 479 if ((start + match.length) <= len) 480 if (matching (source.ptr+start, match.ptr, match.length)) 481 return start; 482 } 483 484 return len; 485 } 486 487 /****************************************************************************** 488 489 Split the provided array on the first pattern instance, and 490 return the resultant head and tail. The pattern is excluded 491 from the two segments. 492 493 Where a segment is not found, tail will be null and the return 494 value will be the original array. 495 496 ******************************************************************************/ 497 498 inout(char)[] head (inout(char)[] src, cstring pattern, out inout(char)[] tail) 499 { 500 auto i = locatePattern (src, pattern); 501 if (i != src.length) 502 { 503 tail = src [i + pattern.length .. $]; 504 src = src [0 .. i]; 505 } 506 return src; 507 } 508 509 /****************************************************************************** 510 511 Split the provided array on the last pattern instance, and 512 return the resultant head and tail. The pattern is excluded 513 from the two segments. 514 515 Where a segment is not found, head will be null and the return 516 value will be the original array. 517 518 ******************************************************************************/ 519 520 inout(char)[] tail (inout(char)[] src, cstring pattern, out inout(char)[] head) 521 { 522 auto i = locatePatternPrior (src, pattern); 523 if (i != src.length) 524 { 525 head = src [0 .. i]; 526 src = src [i + pattern.length .. $]; 527 } 528 return src; 529 } 530 531 /****************************************************************************** 532 533 Split the provided array wherever a delimiter-set instance is 534 found, and return the resultant segments. The delimiters are 535 excluded from each of the segments. Note that delimiters are 536 matched as a set of alternates rather than as a pattern. 537 538 Splitting on a single delimiter is considerably faster than 539 splitting upon a set of alternatives. 540 541 Note that the src content is not duplicated by this function, 542 but is sliced instead. 543 544 ******************************************************************************/ 545 546 inout(char)[][] delimit (inout(char)[] src, cstring set) 547 { 548 typeof(return) result; 549 550 // Cast is needed to avoid instantiating `DelimFruct!(inout(char))` 551 // which can't compile. We cast segment back to `inout` after so the end 552 // result is still type-checked. 553 554 foreach (segment; delimiters (cast(cstring) src, set)) 555 result ~= cast(typeof(src)) segment; 556 return result; 557 } 558 559 /****************************************************************************** 560 561 Split the provided array wherever a pattern instance is 562 found, and return the resultant segments. The pattern is 563 excluded from each of the segments. 564 565 Note that the src content is not duplicated by this function, 566 but is sliced instead. 567 568 ******************************************************************************/ 569 570 inout(char)[][] split (inout(char)[] src, cstring pattern) 571 { 572 typeof(return) result; 573 574 // Cast is needed to avoid instantiating `PatternFruct!(inout(char))` 575 // which can't compile. We cast segment back to `inout` after so the end 576 // result is still type-checked. 577 578 foreach (segment; patterns (cast(cstring) src, pattern)) 579 result ~= cast(typeof(src)) segment; 580 return result; 581 } 582 583 /****************************************************************************** 584 585 Convert text into a set of lines, where each line is identified 586 by a \n or \r\n combination. The line terminator is stripped from 587 each resultant array 588 589 Note that the src content is not duplicated by this function, but 590 is sliced instead. 591 592 ******************************************************************************/ 593 594 inout(char)[][] toLines (inout(char)[] src) 595 { 596 typeof(return) result; 597 598 // Cast is needed to avoid instantiating `LineFruct!(inout(char))` 599 // which can't compile. We cast line back to `inout` after so the end 600 // result is still type-checked. 601 602 foreach (line; lines (cast(cstring) src)) 603 result ~= cast(typeof(src)) line; 604 return result; 605 } 606 607 alias toLines splitLines; 608 609 /****************************************************************************** 610 611 Return the indexed line, where each line is identified by a \n 612 or \r\n combination. The line terminator is stripped from the 613 resultant line 614 615 Note that src content is not duplicated by this function, but 616 is sliced instead. 617 618 ******************************************************************************/ 619 620 inout(char)[] lineOf (inout(char)[] src, size_t index) 621 { 622 int i = 0; 623 624 // Cast is needed to avoid instantiating `LineFruct!(inout(char))` 625 // which can't compile. We cast line back to `inout` after so the end 626 // result is still type-checked. 627 628 foreach (line; lines(cast(cstring) src)) 629 if (i++ is index) 630 return cast(typeof(return)) line; 631 return null; 632 } 633 634 /****************************************************************************** 635 636 Combine a series of text segments together, each appended with 637 a postfix pattern. An optional output buffer can be provided to 638 avoid heap activity - it should be large enough to contain the 639 entire output, otherwise the heap will be used instead. 640 641 Returns a valid slice of the output, containing the concatenated 642 text. 643 644 ******************************************************************************/ 645 646 mstring join (const(char[])[] src, cstring postfix=null, mstring dst = null) 647 { 648 return combine(dst, null, postfix, src); 649 } 650 651 unittest 652 { 653 test (join([ "aaa", "bbb", "ccc" ], ",") == "aaa,bbb,ccc"); 654 655 // ensure `join` works with differently qualified arguments 656 const(char[][]) mut = [ "xxx".dup, "yyy".dup, "zzz" ]; 657 char[20] buf; 658 auto ret = join(mut, " ", buf); 659 test (ret == "xxx yyy zzz"); 660 test (ret.ptr is buf.ptr); 661 } 662 663 /****************************************************************************** 664 665 Combine a series of text segments together, each prepended with 666 a prefix pattern. An optional output buffer can be provided to 667 avoid heap activity - it should be large enough to contain the 668 entire output, otherwise the heap will be used instead. 669 670 Note that, unlike join(), the output buffer is specified first 671 such that a set of trailing strings can be provided. 672 673 Returns a valid slice of the output, containing the concatenated 674 text. 675 676 ******************************************************************************/ 677 678 mstring prefix (mstring dst, cstring prefix, const(char[])[] src...) 679 { 680 return combine(dst, prefix, null, src); 681 } 682 683 /****************************************************************************** 684 685 Combine a series of text segments together, each appended with an 686 optional postfix pattern. An optional output buffer can be provided 687 to avoid heap activity - it should be large enough to contain the 688 entire output, otherwise the heap will be used instead. 689 690 Note that, unlike join(), the output buffer is specified first 691 such that a set of trailing strings can be provided. 692 693 Returns a valid slice of the output, containing the concatenated 694 text. 695 696 ******************************************************************************/ 697 698 mstring postfix (mstring dst, cstring postfix, cstring[] src...) 699 { 700 return combine(dst, null, postfix, src); 701 } 702 703 /****************************************************************************** 704 705 Combine a series of text segments together, each prefixed and/or 706 postfixed with optional strings. An optional output buffer can be 707 provided to avoid heap activity - which should be large enough to 708 contain the entire output, otherwise the heap will be used instead. 709 710 Note that, unlike join(), the output buffer is specified first 711 such that a set of trailing strings can be provided. 712 713 Returns a valid slice of the output, containing the concatenated 714 text. 715 716 ******************************************************************************/ 717 718 mstring combine (mstring dst, cstring prefix, cstring postfix, const(char[])[] src ...) 719 { 720 size_t len = src.length * prefix.length + 721 src.length * postfix.length; 722 723 foreach (segment; src) 724 len += segment.length; 725 726 if (dst.length < len) 727 dst.length = len; 728 729 auto p = dst.ptr; 730 foreach (segment; src) 731 { 732 p[0 .. prefix.length] = prefix; 733 p += prefix.length; 734 p[0 .. segment.length] = segment; 735 p += segment.length; 736 p[0 .. postfix.length] = postfix; 737 p += postfix.length; 738 } 739 740 // remove trailing seperator 741 if (len) 742 len -= postfix.length; 743 return dst [0 .. len]; 744 } 745 746 /****************************************************************************** 747 748 Repeat an array for a specific number of times. An optional output 749 buffer can be provided to avoid heap activity - it should be large 750 enough to contain the entire output, otherwise the heap will be used 751 instead. 752 753 Returns a valid slice of the output, containing the concatenated 754 text. 755 756 ******************************************************************************/ 757 758 mstring repeat (cstring src, size_t count, mstring dst=null) 759 { 760 size_t len = src.length * count; 761 if (len is 0) 762 return null; 763 764 if (dst.length < len) 765 dst.length = len; 766 767 for (auto p = dst.ptr; count--; p += src.length) 768 p[0 .. src.length] = src; 769 770 return dst [0 .. len]; 771 } 772 773 /****************************************************************************** 774 775 Is the argument a whitespace character? 776 777 ******************************************************************************/ 778 779 bool isSpace (char c) 780 { 781 return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v')); 782 } 783 784 /****************************************************************************** 785 786 Return whether or not the two arrays have matching content 787 788 ******************************************************************************/ 789 790 bool matching (const(char)* s1, const(char)* s2, size_t length) 791 { 792 return mismatch(s1, s2, length) is length; 793 } 794 795 /****************************************************************************** 796 797 Returns the index of the first match in str, failing once 798 length is reached. Note that we return 'length' for failure 799 and a 0-based index on success 800 801 ******************************************************************************/ 802 803 size_t indexOf (const(char)* str, char match, size_t length) 804 { 805 enum m1 = cast(size_t) 0x0101010101010101; 806 enum m2 = cast(size_t) 0x8080808080808080; 807 808 if (length) 809 { 810 size_t m = match; 811 m += m << 8; 812 m += (m << (8 * 2)); 813 m += (m << (8 * 4)); 814 815 auto p = str; 816 auto e = p + length - size_t.sizeof; 817 while (p < e) 818 { 819 // clear matching T segments 820 auto v = (*cast(size_t*) p) ^ m; 821 // test for zero, courtesy of Alan Mycroft 822 if ((v - m1) & ~v & m2) 823 break; 824 p += size_t.sizeof; 825 } 826 827 e += size_t.sizeof; 828 while (p < e) 829 if (*p++ is match) 830 return cast(size_t) (p - str - 1); 831 } 832 return length; 833 } 834 835 /****************************************************************************** 836 837 Returns the index of a mismatch between s1 & s2, failing when 838 length is reached. Note that we return 'length' upon failure 839 (array content matches) and a 0-based index upon success. 840 841 Use this as a faster opEquals. Also provides the basis for a 842 faster opCmp, since the index of the first mismatched character 843 can be used to determine the return value 844 845 ******************************************************************************/ 846 847 size_t mismatch (const(char)* s1, const(char)* s2, size_t length) 848 { 849 verify(s1 && s2); 850 851 if (length) 852 { 853 auto start = s1; 854 auto e = start + length - size_t.sizeof; 855 856 while (s1 < e) 857 { 858 if (*cast(size_t*) s1 != *cast(size_t*) s2) 859 break; 860 s1 += size_t.sizeof; 861 s2 += size_t.sizeof; 862 } 863 864 e += size_t.sizeof; 865 while (s1 < e) 866 if (*s1++ != *s2++) 867 return s1 - start - 1; 868 } 869 870 return length; 871 } 872 873 /****************************************************************************** 874 875 Iterator to isolate lines. 876 877 Converts text into a set of lines, where each line is identified 878 by a \n or \r\n combination. The line terminator is stripped from 879 each resultant array. 880 881 --- 882 foreach (line; lines ("one\ntwo\nthree")) 883 ... 884 --- 885 886 ******************************************************************************/ 887 888 LineFruct!(T) lines(T) (T[] src) 889 { 890 LineFruct!(T) lines; 891 lines.src = src; 892 return lines; 893 } 894 895 /****************************************************************************** 896 897 Iterator to isolate text elements. 898 899 Splits the provided array wherever a delimiter-set instance is 900 found, and return the resultant segments. The delimiters are 901 excluded from each of the segments. Note that delimiters are 902 matched as a set of alternates rather than as a pattern. 903 904 Splitting on a single delimiter is considerably faster than 905 splitting upon a set of alternatives. 906 907 --- 908 foreach (segment; delimiters ("one,two;three", ",;")) 909 ... 910 --- 911 912 Has to be templated to propagate mutable/const input qualifier to 913 return struct. 914 915 ******************************************************************************/ 916 917 DelimFruct!T delimiters (T) (T[] src, cstring set) 918 { 919 DelimFruct!T elements; 920 elements.set = set; 921 elements.src = src; 922 return elements; 923 } 924 925 /****************************************************************************** 926 927 Iterator to isolate text elements. 928 929 Split the provided array wherever a pattern instance is found, 930 and return the resultant segments. Pattern are excluded from 931 each of the segments, and an optional sub argument enables 932 replacement. 933 934 --- 935 foreach (segment; patterns ("one, two, three", ", ")) 936 ... 937 --- 938 939 ******************************************************************************/ 940 941 PatternFruct!T patterns(T) (T[] src, cstring pattern, T[] sub = null) 942 { 943 PatternFruct!T elements; 944 elements.pattern = pattern; 945 elements.sub = sub; 946 elements.src = src; 947 return elements; 948 } 949 950 unittest 951 { 952 cstring[] arr; 953 foreach (match; patterns("aaa..bbb..ccc", "..")) 954 arr ~= match; 955 test (arr == [ "aaa", "bbb", "ccc" ]); 956 957 arr = [ ]; 958 foreach (match; patterns("aaa..bbb..ccc", "..", "X")) 959 arr ~= match; 960 test (arr == [ "aaa", "X", "bbb", "X", "ccc" ]); 961 } 962 963 /****************************************************************************** 964 965 Iterator to isolate optionally quoted text elements. 966 967 As per elements(), but with the extension of being quote-aware; 968 the set of delimiters is ignored inside a pair of quotes. Note 969 that an unterminated quote will consume remaining content. 970 971 --- 972 foreach (quote; quotes ("one two 'three four' five", " ")) 973 ... 974 --- 975 976 ******************************************************************************/ 977 978 QuoteFruct!T quotes(T) (T[] src, cstring set) 979 { 980 QuoteFruct!T quotes; 981 quotes.set = set; 982 quotes.src = src; 983 return quotes; 984 } 985 986 /******************************************************************************* 987 988 Arranges text strings in order, using indices to specify where 989 each particular argument should be positioned within the text. 990 This is handy for collating I18N components, or as a simplistic 991 and lightweight formatter. Indices range from zero through nine. 992 993 --- 994 // write ordered text to the console 995 char[64] tmp; 996 997 Cout (layout (tmp, "%1 is after %0", "zero", "one")).newline; 998 --- 999 1000 *******************************************************************************/ 1001 1002 cstring layout(mstring output, cstring format, const(char[])[] arguments ...) 1003 { 1004 return layout_(output, format, arguments); 1005 } 1006 1007 private cstring layout_ (mstring output, cstring format, const(char[])[] arguments) 1008 { 1009 static badarg = "{index out of range}"; 1010 static toosmall = "{output buffer too small}"; 1011 1012 size_t pos, 1013 args; 1014 bool state; 1015 1016 args = arguments.length; 1017 foreach (c; format) 1018 { 1019 if (state) 1020 { 1021 state = false; 1022 if (c >= '0' && c <= '9') 1023 { 1024 size_t index = c - '0'; 1025 if (index < args) 1026 { 1027 auto x = arguments[index]; 1028 1029 size_t limit = pos + x.length; 1030 if (limit < output.length) 1031 { 1032 output [pos .. limit] = x; 1033 pos = limit; 1034 continue; 1035 } 1036 else 1037 return toosmall; 1038 } 1039 else 1040 return badarg; 1041 } 1042 } 1043 else 1044 if (c is '%') 1045 { 1046 state = true; 1047 continue; 1048 } 1049 1050 if (pos < output.length) 1051 { 1052 output[pos] = c; 1053 ++pos; 1054 } 1055 else 1056 return toosmall; 1057 } 1058 1059 return output [0..pos]; 1060 } 1061 1062 /****************************************************************************** 1063 1064 Convert 'escaped' chars to normal ones: \t => ^t for example. 1065 Supports \" \' \\ \a \b \f \n \r \t \v 1066 1067 ******************************************************************************/ 1068 1069 cstring unescape (cstring src, mstring dst = null) 1070 { 1071 ptrdiff_t delta; 1072 auto s = src.ptr; 1073 auto len = src.length; 1074 1075 // take a peek first to see if there's anything 1076 if ((delta = indexOf (s, '\\', len)) < len) 1077 { 1078 // make some room if not enough provided 1079 if (dst.length < src.length) 1080 dst.length = src.length; 1081 auto d = dst.ptr; 1082 1083 // copy segments over, a chunk at a time 1084 do { 1085 d [0 .. delta] = s [0 .. delta]; 1086 len -= delta; 1087 s += delta; 1088 d += delta; 1089 1090 // bogus trailing '\' 1091 if (len < 2) 1092 { 1093 *d++ = '\\'; 1094 len = 0; 1095 break; 1096 } 1097 1098 // translate \char 1099 char c = s[1]; 1100 switch (c) 1101 { 1102 case '\\': 1103 break; 1104 case '\'': 1105 c = '\''; 1106 break; 1107 case '"': 1108 c = '"'; 1109 break; 1110 case 'a': 1111 c = '\a'; 1112 break; 1113 case 'b': 1114 c = '\b'; 1115 break; 1116 case 'f': 1117 c = '\f'; 1118 break; 1119 case 'n': 1120 c = '\n'; 1121 break; 1122 case 'r': 1123 c = '\r'; 1124 break; 1125 case 't': 1126 c = '\t'; 1127 break; 1128 case 'v': 1129 c = '\v'; 1130 break; 1131 default: 1132 *d++ = '\\'; 1133 } 1134 *d++ = c; 1135 len -= 2; 1136 s += 2; 1137 } while ((delta = indexOf (s, '\\', len)) < len); 1138 1139 // copy tail too 1140 d [0 .. len] = s [0 .. len]; 1141 return dst [0 .. (d + len) - dst.ptr]; 1142 } 1143 return src; 1144 } 1145 1146 1147 /****************************************************************************** 1148 1149 jhash() -- hash a variable-length key into a 32-bit value 1150 1151 k : the key (the unaligned variable-length array of bytes) 1152 len : the length of the key, counting by bytes 1153 level : can be any 4-byte value 1154 1155 Returns a 32-bit value. Every bit of the key affects every bit of 1156 the return value. Every 1-bit and 2-bit delta achieves avalanche. 1157 1158 About 4.3*len + 80 X86 instructions, with excellent pipelining 1159 1160 The best hash table sizes are powers of 2. There is no need to do 1161 mod a prime (mod is sooo slow!). If you need less than 32 bits, 1162 use a bitmask. For example, if you need only 10 bits, do 1163 1164 h = (h & hashmask(10)); 1165 1166 In which case, the hash table should have hashsize(10) elements. 1167 If you are hashing n strings (ub1 **)k, do it like this: 1168 1169 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); 1170 1171 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use 1172 this code any way you wish, private, educational, or commercial. 1173 It's free. 1174 1175 See http://burtleburtle.net/bob/hash/evahash.html 1176 Use for hash table lookup, or anything where one collision in 2^32 1177 is acceptable. Do NOT use for cryptographic purposes. 1178 1179 ******************************************************************************/ 1180 1181 size_t jhash (ubyte* k, size_t len, size_t c = 0) 1182 { 1183 size_t a = 0x9e3779b9, 1184 b = 0x9e3779b9, 1185 i = len; 1186 1187 // handle most of the key 1188 while (i >= 12) 1189 { 1190 a += *cast(uint *)(k+0); 1191 b += *cast(uint *)(k+4); 1192 c += *cast(uint *)(k+8); 1193 1194 a -= b; a -= c; a ^= (c>>13); 1195 b -= c; b -= a; b ^= (a<<8); 1196 c -= a; c -= b; c ^= (b>>13); 1197 a -= b; a -= c; a ^= (c>>12); 1198 b -= c; b -= a; b ^= (a<<16); 1199 c -= a; c -= b; c ^= (b>>5); 1200 a -= b; a -= c; a ^= (c>>3); 1201 b -= c; b -= a; b ^= (a<<10); 1202 c -= a; c -= b; c ^= (b>>15); 1203 k += 12; i -= 12; 1204 } 1205 1206 // handle the last 11 bytes 1207 c += len; 1208 switch (i) 1209 { 1210 case 11: c+=(cast(uint)k[10]<<24); goto case; 1211 case 10: c+=(cast(uint)k[9]<<16); goto case; 1212 case 9 : c+=(cast(uint)k[8]<<8); goto case; 1213 case 8 : b+=(cast(uint)k[7]<<24); goto case; 1214 case 7 : b+=(cast(uint)k[6]<<16); goto case; 1215 case 6 : b+=(cast(uint)k[5]<<8); goto case; 1216 case 5 : b+=(cast(uint)k[4]); goto case; 1217 case 4 : a+=(cast(uint)k[3]<<24); goto case; 1218 case 3 : a+=(cast(uint)k[2]<<16); goto case; 1219 case 2 : a+=(cast(uint)k[1]<<8); goto case; 1220 case 1 : a+=(cast(uint)k[0]); goto default; 1221 default: 1222 } 1223 1224 a -= b; a -= c; a ^= (c>>13); 1225 b -= c; b -= a; b ^= (a<<8); 1226 c -= a; c -= b; c ^= (b>>13); 1227 a -= b; a -= c; a ^= (c>>12); 1228 b -= c; b -= a; b ^= (a<<16); 1229 c -= a; c -= b; c ^= (b>>5); 1230 a -= b; a -= c; a ^= (c>>3); 1231 b -= c; b -= a; b ^= (a<<10); 1232 c -= a; c -= b; c ^= (b>>15); 1233 1234 return c; 1235 } 1236 1237 /// ditto 1238 size_t jhash (void[] x, size_t c = 0) 1239 { 1240 return jhash (cast(ubyte*) x.ptr, x.length, c); 1241 } 1242 1243 1244 /****************************************************************************** 1245 1246 Helper fruct for iterator lines(). A fruct is a low 1247 impact mechanism for capturing context relating to an 1248 opApply (conjunction of the names struct and foreach) 1249 1250 ******************************************************************************/ 1251 1252 private struct LineFruct(T) 1253 { 1254 private T[] src; 1255 1256 int opApply (scope int delegate (ref T[] line) dg) 1257 { 1258 int ret; 1259 size_t pos, 1260 mark; 1261 T[] line; 1262 1263 enum T nl = '\n'; 1264 enum T cr = '\r'; 1265 1266 while ((pos = locate (src, nl, mark)) < src.length) 1267 { 1268 auto end = pos; 1269 if (end && src[end-1] is cr) 1270 --end; 1271 1272 line = src [mark .. end]; 1273 if ((ret = dg (line)) != 0) 1274 return ret; 1275 mark = pos + 1; 1276 } 1277 1278 line = src [mark .. $]; 1279 if (mark <= src.length) 1280 ret = dg (line); 1281 1282 return ret; 1283 } 1284 } 1285 1286 /****************************************************************************** 1287 1288 Helper fruct for iterator delims(). A fruct is a low 1289 impact mechanism for capturing context relating to an 1290 opApply (conjunction of the names struct and foreach) 1291 1292 ******************************************************************************/ 1293 1294 private struct DelimFruct (T) 1295 { 1296 private T[] src; 1297 private cstring set; 1298 1299 int opApply (scope int delegate (ref T[] token) dg) 1300 { 1301 int ret; 1302 size_t pos, 1303 mark; 1304 T[] token; 1305 1306 // optimize for single delimiter case 1307 if (set.length is 1) 1308 while ((pos = locate (src, set[0], mark)) < src.length) 1309 { 1310 token = src [mark .. pos]; 1311 if ((ret = dg (token)) != 0) 1312 return ret; 1313 mark = pos + 1; 1314 } 1315 else 1316 if (set.length > 1) 1317 foreach (i, elem; src) 1318 if (contains (set, elem)) 1319 { 1320 token = src [mark .. i]; 1321 if ((ret = dg (token)) != 0) 1322 return ret; 1323 mark = i + 1; 1324 } 1325 1326 token = src [mark .. $]; 1327 if (mark <= src.length) 1328 ret = dg (token); 1329 1330 return ret; 1331 } 1332 } 1333 1334 /****************************************************************************** 1335 1336 Helper fruct for iterator patterns(). A fruct is a low 1337 impact mechanism for capturing context relating to an 1338 opApply (conjunction of the names struct and foreach) 1339 1340 ******************************************************************************/ 1341 1342 public struct PatternFruct (T) 1343 { 1344 private T[] src, sub; 1345 private cstring pattern; 1346 1347 int opApply (scope int delegate (ref T[] token) dg) 1348 { 1349 int ret; 1350 size_t pos, 1351 mark; 1352 T[] token; 1353 1354 while ((pos = index (src, pattern, mark)) < src.length) 1355 { 1356 token = src [mark .. pos]; 1357 if ((ret = dg(token)) != 0) 1358 return ret; 1359 if (sub.ptr && (ret = dg(sub)) != 0) 1360 return ret; 1361 mark = pos + pattern.length; 1362 } 1363 1364 token = src [mark .. $]; 1365 if (mark <= src.length) 1366 ret = dg (token); 1367 1368 return ret; 1369 } 1370 } 1371 1372 /****************************************************************************** 1373 1374 Helper fruct for iterator quotes(). A fruct is a low 1375 impact mechanism for capturing context relating to an 1376 opApply (conjunction of the names struct and foreach) 1377 1378 ******************************************************************************/ 1379 1380 private struct QuoteFruct (T) 1381 { 1382 private T[] src; 1383 private cstring set; 1384 1385 int opApply (scope int delegate (ref T[] token) dg) 1386 { 1387 int ret; 1388 size_t mark; 1389 T[] token; 1390 1391 if (set.length) 1392 for (size_t i=0; i < src.length; ++i) 1393 { 1394 T c = src[i]; 1395 if (c is '"' || c is '\'') 1396 i = locate (src, c, i+1); 1397 else 1398 if (contains (set, c)) 1399 { 1400 token = src [mark .. i]; 1401 if ((ret = dg (token)) != 0) 1402 return ret; 1403 mark = i + 1; 1404 } 1405 } 1406 1407 token = src [mark .. $]; 1408 if (mark <= src.length) 1409 ret = dg (token); 1410 1411 return ret; 1412 } 1413 } 1414 1415 1416 /****************************************************************************** 1417 1418 ******************************************************************************/ 1419 1420 unittest 1421 { 1422 char[64] tmp; 1423 1424 test (isSpace (' ') && !isSpace ('d')); 1425 1426 test (indexOf ("abc".ptr, 'a', 3) is 0); 1427 test (indexOf ("abc".ptr, 'b', 3) is 1); 1428 test (indexOf ("abc".ptr, 'c', 3) is 2); 1429 test (indexOf ("abc".ptr, 'd', 3) is 3); 1430 test (indexOf ("abcabcabc".ptr, 'd', 9) is 9); 1431 1432 test (mismatch ("abc".ptr, "abc".ptr, 3) is 3); 1433 test (mismatch ("abc".ptr, "abd".ptr, 3) is 2); 1434 test (mismatch ("abc".ptr, "acc".ptr, 3) is 1); 1435 test (mismatch ("abc".ptr, "ccc".ptr, 3) is 0); 1436 1437 test (matching ("abc".ptr, "abc".ptr, 3)); 1438 test (matching ("abc".ptr, "abb".ptr, 3) is false); 1439 1440 test (contains ("abc", 'a')); 1441 test (contains ("abc", 'b')); 1442 test (contains ("abc", 'c')); 1443 test (contains ("abc", 'd') is false); 1444 1445 test (containsPattern ("abc", "ab")); 1446 test (containsPattern ("abc", "bc")); 1447 test (containsPattern ("abc", "abc")); 1448 test (containsPattern ("abc", "zabc") is false); 1449 test (containsPattern ("abc", "abcd") is false); 1450 test (containsPattern ("abc", "za") is false); 1451 test (containsPattern ("abc", "cd") is false); 1452 1453 test (trim ("") == ""); 1454 test (trim (" abc ") == "abc"); 1455 test (trim (" ") == ""); 1456 1457 test (strip ("", '%') == ""); 1458 test (strip ("%abc%%%", '%') == "abc"); 1459 test (strip ("#####", '#') == ""); 1460 test (stripl ("#####", '#') == ""); 1461 test (stripl (" ###", ' ') == "###"); 1462 test (stripl ("#####", 's') == "#####"); 1463 test (stripr ("#####", '#') == ""); 1464 test (stripr ("### ", ' ') == "###"); 1465 test (stripr ("#####", 's') == "#####"); 1466 1467 test (replace ("abc".dup, 'b', ':') == "a:c"); 1468 test (substitute ("abc", "bc", "x") == "ax"); 1469 1470 test (locate ("abc".dup, 'c', 1) is 2); 1471 1472 test (locate ("abc", 'c') is 2); 1473 test (locate ("abc", 'a') is 0); 1474 test (locate ("abc", 'd') is 3); 1475 test (locate ("", 'c') is 0); 1476 1477 test (locatePrior ("abce".dup, 'c') is 2); 1478 test (locatePrior ("abce", 'a') is 0); 1479 test (locatePrior ("abce", 'd') is 4); 1480 test (locatePrior ("abce", 'c', 3) is 2); 1481 test (locatePrior ("abce", 'c', 2) is 4); 1482 test (locatePrior ("", 'c') is 0); 1483 1484 auto x = delimit ("::b", ":"); 1485 test (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b"); 1486 x = delimit ("a:bc:d", ":"); 1487 test (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d"); 1488 x = delimit ("abcd", ":"); 1489 test (x.length is 1 && x[0] == "abcd"); 1490 x = delimit ("abcd:", ":"); 1491 test (x.length is 2 && x[0] == "abcd" && x[1] == ""); 1492 x = delimit ("a;b$c#d:e@f", ";:$#@"); 1493 test (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" && 1494 x[3]=="d" && x[4]=="e" && x[5]=="f"); 1495 1496 test (locatePattern ("abcdefg".dup, "") is 7); 1497 test (locatePattern ("abcdefg", "g") is 6); 1498 test (locatePattern ("abcdefg", "abcdefg") is 0); 1499 test (locatePattern ("abcdefg", "abcdefgx") is 7); 1500 test (locatePattern ("abcdefg", "cce") is 7); 1501 test (locatePattern ("abcdefg", "cde") is 2); 1502 test (locatePattern ("abcdefgcde", "cde", 3) is 7); 1503 1504 test (locatePatternPrior ("abcdefg".dup, "") is 7); 1505 test (locatePatternPrior ("abcdefg", "cce") is 7); 1506 test (locatePatternPrior ("abcdefg", "cde") is 2); 1507 test (locatePatternPrior ("abcdefgcde", "cde", 6) is 2); 1508 test (locatePatternPrior ("abcdefgcde", "cde", 4) is 2); 1509 test (locatePatternPrior ("abcdefg", "abcdefgx") is 7); 1510 1511 x = splitLines ("a\nb\n"); 1512 test (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == ""); 1513 x = splitLines ("a\r\n"); 1514 test (x.length is 2 && x[0] == "a" && x[1] == ""); 1515 1516 x = splitLines ("a"); 1517 test (x.length is 1 && x[0] == "a"); 1518 x = splitLines (""); 1519 test (x.length is 1); 1520 1521 cstring[] q; 1522 foreach (element; quotes ("1 'avcc cc ' 3", " ")) 1523 q ~= element; 1524 test (q.length is 3 && q[0] == "1" && q[1] == "'avcc cc '" && q[2] == "3"); 1525 1526 test (layout (tmp, "%1,%%%c %0", "abc", "efg") == "efg,%c abc"); 1527 1528 x = split ("one, two, three", ","); 1529 test (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three"); 1530 x = split ("one, two, three", ", "); 1531 test (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three"); 1532 x = split ("one, two, three", ",,"); 1533 test (x.length is 1 && x[0] == "one, two, three"); 1534 x = split ("one,,", ","); 1535 test (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == ""); 1536 1537 istring t, h; 1538 h = head ("one:two:three", ":", t); 1539 test (h == "one" && t == "two:three"); 1540 h = head ("one:::two:three", ":::", t); 1541 test (h == "one" && t == "two:three"); 1542 h = head ("one:two:three", "*", t); 1543 test (h == "one:two:three" && t is null); 1544 1545 t = tail ("one:two:three", ":", h); 1546 test (h == "one:two" && t == "three"); 1547 t = tail ("one:::two:three", ":::", h); 1548 test (h == "one" && t == "two:three"); 1549 t = tail ("one:two:three", "*", h); 1550 test (t == "one:two:three" && h is null); 1551 1552 test (chopl("hello world", "hello ") == "world"); 1553 test (chopl("hello", "hello") == ""); 1554 test (chopl("hello world", " ") == "hello world"); 1555 test (chopl("hello world", "") == "hello world"); 1556 1557 test (chopr("hello world", " world") == "hello"); 1558 test (chopr("hello", "hello") == ""); 1559 test (chopr("hello world", " ") == "hello world"); 1560 test (chopr("hello world", "") == "hello world"); 1561 1562 istring[] foo = ["one", "two", "three"]; 1563 auto j = join (foo); 1564 test (j == "onetwothree"); 1565 j = join (foo, ", "); 1566 test (j == "one, two, three"); 1567 j = join (foo, " ", tmp); 1568 test (j == "one two three"); 1569 test (j.ptr is tmp.ptr); 1570 1571 test (repeat ("abc", 0) == ""); 1572 test (repeat ("abc", 1) == "abc"); 1573 test (repeat ("abc", 2) == "abcabc"); 1574 test (repeat ("abc", 4) == "abcabcabcabc"); 1575 test (repeat ("", 4) == ""); 1576 char[10] rep; 1577 test (repeat ("abc", 0, rep) == ""); 1578 test (repeat ("abc", 1, rep) == "abc"); 1579 test (repeat ("abc", 2, rep) == "abcabc"); 1580 test (repeat ("", 4, rep) == ""); 1581 1582 test (unescape ("abc") == "abc"); 1583 test (unescape ("abc\\") == "abc\\"); 1584 test (unescape ("abc\\t") == "abc\t"); 1585 test (unescape ("abc\\tc") == "abc\tc"); 1586 test (unescape ("\\t") == "\t"); 1587 test (unescape ("\\tx") == "\tx"); 1588 test (unescape ("\\v\\vx") == "\v\vx"); 1589 test (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc"); 1590 } 1591 1592 debug (Util) 1593 { 1594 auto x = import("Util.d"); 1595 1596 void main() 1597 { 1598 mismatch ("".ptr, x.ptr, 0); 1599 indexOf ("".ptr, '@', 0); 1600 char[] s; 1601 split (s, " "); 1602 //indexOf (s.ptr, '@', 0); 1603 1604 } 1605 }