1 /******************************************************************************* 2 3 Placeholder for a variety of wee functions. 4 5 Several of these functions return an index value, representing where 6 some criteria was identified. When said criteria is not matched, the 7 functions return a value representing the array length provided to 8 them. That is, for those scenarios where C functions might typically 9 return -1 these functions return length instead. This operate nicely 10 with D slices: 11 --- 12 auto text = "happy:faces"; 13 14 assert (text[0 .. locate (text, ':')] == "happy"); 15 16 assert (text[0 .. locate (text, '!')] == "happy:faces"); 17 --- 18 19 The contains() function is more convenient for trivial 20 lookup cases: 21 --- 22 if (contains ("fubar", '!')) 23 ... 24 --- 25 26 Note that where some functions expect a size_t as an argument, the 27 D template-matching algorithm will fail where an int is provided 28 instead. This is the typically the cause of "template not found" 29 errors. Also note that name overloading is not supported cleanly 30 by IFTI at this time, so is not applied here. 31 32 33 Applying the D "import alias" mechanism to this module is highly 34 recommended, in order to limit namespace pollution: 35 --- 36 import Util = ocean.text.Util; 37 38 auto s = Util.trim (" foo "); 39 --- 40 41 42 Function templates: 43 --- 44 trim (source) // trim whitespace 45 triml (source) // trim whitespace 46 trimr (source) // trim whitespace 47 strip (source, match) // trim elements 48 stripl (source, match) // trim elements 49 stripr (source, match) // trim elements 50 chopl (source, match) // trim pattern match 51 chopr (source, match) // trim pattern match 52 delimit (src, set) // split on delims 53 split (source, pattern) // split on pattern 54 splitLines (source); // split on lines 55 head (source, pattern, tail) // split to head & tail 56 join (source, postfix, output) // join text segments 57 prefix (dst, prefix, content...) // prefix text segments 58 postfix (dst, postfix, content...) // postfix text segments 59 combine (dst, prefix, postfix, content...) // combine lotsa stuff 60 repeat (source, count, output) // repeat source 61 replace (source, match, replacement) // replace chars 62 substitute (source, match, replacement) // replace/remove matches 63 count (source, match) // count instances 64 contains (source, match) // has char? 65 containsPattern (source, match) // has pattern? 66 index (source, match, start) // find match index 67 locate (source, match, start) // find char 68 locatePrior (source, match, start) // find prior char 69 locatePattern (source, match, start); // find pattern 70 locatePatternPrior (source, match, start); // find prior pattern 71 indexOf (s*, match, length) // low-level lookup 72 mismatch (s1*, s2*, length) // low-level compare 73 matching (s1*, s2*, length) // low-level compare 74 isSpace (match) // is whitespace? 75 unescape(source, output) // convert '\' prefixes 76 lines (str) // foreach lines 77 quotes (str, set) // foreach quotes 78 delimiters (str, set) // foreach delimiters 79 patterns (str, pattern) // foreach patterns 80 --- 81 82 Please note that any 'pattern' referred to within this module 83 refers to a pattern of characters, and not some kind of regex 84 descriptor. Use the Regex module for regex operation. 85 86 Copyright: 87 Copyright (c) 2004 Kris Bell. 88 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH. 89 All rights reserved. 90 91 License: 92 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0. 93 See LICENSE_TANGO.txt for details. 94 95 Version: 96 Apr 2004: Initial release 97 Dec 2006: South Seas version 98 99 Authors: Kris 100 101 *******************************************************************************/ 102 103 module ocean.text.Util; 104 105 //import ocean.meta.types.Qualifiers : cstring, mstring; 106 import ocean.meta.types.Qualifiers; 107 import ocean.core.Verify; 108 109 version (unittest) import ocean.core.Test; 110 111 112 /****************************************************************************** 113 114 Trim the provided array by stripping whitespace from both 115 ends. Returns a slice of the original content 116 117 ******************************************************************************/ 118 119 inout(char)[] trim (inout(char)[] source) 120 { 121 inout(char)* head = source.ptr, 122 tail = head + source.length; 123 124 while (head < tail && isSpace(*head)) 125 ++head; 126 127 while (tail > head && isSpace(*(tail-1))) 128 --tail; 129 130 return head [0 .. tail - head]; 131 } 132 133 /****************************************************************************** 134 135 Trim the provided array by stripping whitespace from the left. 136 Returns a slice of the original content 137 138 ******************************************************************************/ 139 140 inout(char)[] triml (inout(char)[] source) 141 { 142 inout(char)* head = source.ptr, 143 tail = head + source.length; 144 145 while (head < tail && isSpace(*head)) 146 ++head; 147 148 return head [0 .. tail - head]; 149 } 150 151 /****************************************************************************** 152 153 Trim the provided array by stripping whitespace from the right. 154 Returns a slice of the original content 155 156 ******************************************************************************/ 157 158 inout(char)[] trimr (inout(char)[] source) 159 { 160 inout(char)* head = source.ptr, 161 tail = head + source.length; 162 163 while (tail > head && isSpace(*(tail-1))) 164 --tail; 165 166 return head [0 .. tail - head]; 167 } 168 169 /****************************************************************************** 170 171 Trim the given array by stripping the provided match from 172 both ends. Returns a slice of the original content 173 174 ******************************************************************************/ 175 176 inout(char)[] strip (inout(char)[] source, char match) 177 { 178 inout(char)* head = source.ptr, 179 tail = head + source.length; 180 181 while (head < tail && *head is match) 182 ++head; 183 184 while (tail > head && *(tail-1) is match) 185 --tail; 186 187 return head [0 .. tail - head]; 188 } 189 190 /****************************************************************************** 191 192 Trim the given array by stripping the provided match from 193 the left hand side. Returns a slice of the original content 194 195 ******************************************************************************/ 196 197 inout(char)[] stripl (inout(char)[] source, char match) 198 { 199 inout(char)* head = source.ptr, 200 tail = head + source.length; 201 202 while (head < tail && *head is match) 203 ++head; 204 205 return head [0 .. tail - head]; 206 } 207 208 /****************************************************************************** 209 210 Trim the given array by stripping the provided match from 211 the right hand side. Returns a slice of the original content 212 213 ******************************************************************************/ 214 215 inout(char)[] stripr (inout(char)[] source, char match) 216 { 217 inout(char)* head = source.ptr, 218 tail = head + source.length; 219 220 while (tail > head && *(tail-1) is match) 221 --tail; 222 223 return head [0 .. tail - head]; 224 } 225 226 /****************************************************************************** 227 228 Chop the given source by stripping the provided match from 229 the left hand side. Returns a slice of the original content 230 231 ******************************************************************************/ 232 233 inout(char)[] chopl (inout(char)[] source, cstring match) 234 { 235 if (match.length <= source.length) 236 if (source[0 .. match.length] == match) 237 source = source [match.length .. $]; 238 239 return source; 240 } 241 242 /****************************************************************************** 243 244 Chop the given source by stripping the provided match from 245 the right hand side. Returns a slice of the original content 246 247 ******************************************************************************/ 248 249 inout(char)[] chopr (inout(char)[] source, cstring match) 250 { 251 if (match.length <= source.length) 252 if (source[$-match.length .. $] == match) 253 source = source [0 .. $-match.length]; 254 255 return source; 256 } 257 258 /****************************************************************************** 259 260 Replace all instances of one element with another (in place) 261 262 ******************************************************************************/ 263 264 mstring replace (mstring source, char match, char replacement) 265 { 266 foreach (ref c; source) 267 if (c is match) 268 c = replacement; 269 return source; 270 } 271 272 /****************************************************************************** 273 274 Substitute all instances of match from source. Set replacement 275 to null in order to remove instead of replace 276 277 ******************************************************************************/ 278 279 mstring substitute (cstring source, cstring match, cstring replacement) 280 { 281 mstring output; 282 283 foreach (s; patterns (source, match, replacement)) 284 { 285 output ~= s; 286 } 287 return output; 288 } 289 290 /****************************************************************************** 291 292 Count all instances of match within source 293 294 ******************************************************************************/ 295 296 size_t count (cstring source, cstring match) 297 { 298 size_t c; 299 300 foreach (s; patterns (source, match)) 301 ++c; 302 verify(c > 0); 303 return c - 1; 304 } 305 306 /****************************************************************************** 307 308 Returns whether or not the provided array contains an instance 309 of the given match 310 311 ******************************************************************************/ 312 313 bool contains (cstring source, char match) 314 { 315 return indexOf (source.ptr, match, source.length) != source.length; 316 } 317 318 /****************************************************************************** 319 320 Returns whether or not the provided array contains an instance 321 of the given match 322 323 ******************************************************************************/ 324 325 bool containsPattern (cstring source, cstring match) 326 { 327 return locatePattern (source, match) != source.length; 328 } 329 330 unittest 331 { 332 mstring greeting = "Hello world".dup; 333 istring satan = "Hell"; 334 test(containsPattern(greeting, satan), "Pattern not found"); 335 } 336 337 338 /****************************************************************************** 339 340 Return the index of the next instance of 'match' starting at 341 position 'start', or source.length where there is no match. 342 343 Parameter 'start' defaults to 0 344 345 ******************************************************************************/ 346 347 size_t index (cstring source, cstring match, size_t start=0) 348 { 349 return (match.length is 1) ? locate (source, match[0], start) 350 : locatePattern (source, match, start); 351 } 352 353 unittest 354 { 355 char[] url; 356 size_t start; 357 cstring CSLASHSLASH = "://"; 358 start = index(url, CSLASHSLASH); 359 } 360 361 /****************************************************************************** 362 363 Return the index of the prior instance of 'match' starting 364 just before 'start', or source.length where there is no match. 365 366 Parameter 'start' defaults to source.length 367 368 ******************************************************************************/ 369 370 size_t rindex (cstring source, cstring match, size_t start=size_t.max) 371 { 372 return (match.length is 1) ? locatePrior (source, match[0], start) 373 : locatePatternPrior (source, match, start); 374 } 375 376 /****************************************************************************** 377 378 Return the index of the next instance of 'match' starting at 379 position 'start', or source.length where there is no match. 380 381 Parameter 'start' defaults to 0 382 383 ******************************************************************************/ 384 385 size_t locate (cstring source, char match, size_t start=0) 386 { 387 if (start > source.length) 388 start = source.length; 389 390 return indexOf (source.ptr+start, match, source.length - start) + start; 391 } 392 393 unittest 394 { 395 size_t start; 396 char[] url; 397 start = locate(url, '/', start); 398 } 399 400 /****************************************************************************** 401 402 Return the index of the prior instance of 'match' starting 403 just before 'start', or source.length where there is no match. 404 405 Parameter 'start' defaults to source.length 406 407 ******************************************************************************/ 408 409 size_t locatePrior (cstring source, char match, size_t start = size_t.max) 410 { 411 if (start > source.length) 412 start = source.length; 413 414 while (start > 0) 415 if (source[--start] is match) 416 return start; 417 return source.length; 418 } 419 420 /****************************************************************************** 421 422 Return the index of the next instance of 'match' starting at 423 position 'start', or source.length where there is no match. 424 425 Parameter 'start' defaults to 0 426 427 ******************************************************************************/ 428 429 size_t locatePattern (cstring source, cstring match, size_t start=0) 430 { 431 size_t idx; 432 const(char)* p = source.ptr + start; 433 size_t extent = source.length - start - match.length + 1; 434 435 if (match.length && extent <= source.length) 436 { 437 while (extent) 438 if ((idx = indexOf (p, match[0], extent)) is extent) 439 break; 440 else 441 { 442 if (matching (p+=idx, match.ptr, match.length)) 443 return p - source.ptr; 444 else 445 { 446 extent -= (idx+1); 447 ++p; 448 } 449 } 450 } 451 452 return source.length; 453 } 454 455 /****************************************************************************** 456 457 Return the index of the prior instance of 'match' starting 458 just before 'start', or source.length where there is no match. 459 460 Parameter 'start' defaults to source.length 461 462 ******************************************************************************/ 463 464 size_t locatePatternPrior (cstring source, cstring match, size_t start=size_t.max) 465 { 466 auto len = source.length; 467 468 if (start > len) 469 start = len; 470 471 if (match.length && match.length <= len) 472 while (start) 473 { 474 start = locatePrior (source, match[0], start); 475 if (start is len) 476 break; 477 else 478 if ((start + match.length) <= len) 479 if (matching (source.ptr+start, match.ptr, match.length)) 480 return start; 481 } 482 483 return len; 484 } 485 486 /****************************************************************************** 487 488 Split the provided array on the first pattern instance, and 489 return the resultant head and tail. The pattern is excluded 490 from the two segments. 491 492 Where a segment is not found, tail will be null and the return 493 value will be the original array. 494 495 ******************************************************************************/ 496 497 inout(char)[] head (inout(char)[] src, cstring pattern, out inout(char)[] tail) 498 { 499 auto i = locatePattern (src, pattern); 500 if (i != src.length) 501 { 502 tail = src [i + pattern.length .. $]; 503 src = src [0 .. i]; 504 } 505 return src; 506 } 507 508 /****************************************************************************** 509 510 Split the provided array on the last pattern instance, and 511 return the resultant head and tail. The pattern is excluded 512 from the two segments. 513 514 Where a segment is not found, head will be null and the return 515 value will be the original array. 516 517 ******************************************************************************/ 518 519 inout(char)[] tail (inout(char)[] src, cstring pattern, out inout(char)[] head) 520 { 521 auto i = locatePatternPrior (src, pattern); 522 if (i != src.length) 523 { 524 head = src [0 .. i]; 525 src = src [i + pattern.length .. $]; 526 } 527 return src; 528 } 529 530 /****************************************************************************** 531 532 Split the provided array wherever a delimiter-set instance is 533 found, and return the resultant segments. The delimiters are 534 excluded from each of the segments. Note that delimiters are 535 matched as a set of alternates rather than as a pattern. 536 537 Splitting on a single delimiter is considerably faster than 538 splitting upon a set of alternatives. 539 540 Note that the src content is not duplicated by this function, 541 but is sliced instead. 542 543 ******************************************************************************/ 544 545 inout(char)[][] delimit (inout(char)[] src, cstring set) 546 { 547 typeof(return) result; 548 549 // Cast is needed to avoid instantiating `DelimFruct!(inout(char))` 550 // which can't compile. We cast segment back to `inout` after so the end 551 // result is still type-checked. 552 553 foreach (segment; delimiters (cast(cstring) src, set)) 554 result ~= cast(typeof(src)) segment; 555 return result; 556 } 557 558 /****************************************************************************** 559 560 Split the provided array wherever a pattern instance is 561 found, and return the resultant segments. The pattern is 562 excluded from each of the segments. 563 564 Note that the src content is not duplicated by this function, 565 but is sliced instead. 566 567 ******************************************************************************/ 568 569 inout(char)[][] split (inout(char)[] src, cstring pattern) 570 { 571 typeof(return) result; 572 573 // Cast is needed to avoid instantiating `PatternFruct!(inout(char))` 574 // which can't compile. We cast segment back to `inout` after so the end 575 // result is still type-checked. 576 577 foreach (segment; patterns (cast(cstring) src, pattern)) 578 result ~= cast(typeof(src)) segment; 579 return result; 580 } 581 582 /****************************************************************************** 583 584 Convert text into a set of lines, where each line is identified 585 by a \n or \r\n combination. The line terminator is stripped from 586 each resultant array 587 588 Note that the src content is not duplicated by this function, but 589 is sliced instead. 590 591 ******************************************************************************/ 592 593 inout(char)[][] toLines (inout(char)[] src) 594 { 595 typeof(return) result; 596 597 // Cast is needed to avoid instantiating `LineFruct!(inout(char))` 598 // which can't compile. We cast line back to `inout` after so the end 599 // result is still type-checked. 600 601 foreach (line; lines (cast(cstring) src)) 602 result ~= cast(typeof(src)) line; 603 return result; 604 } 605 606 alias toLines splitLines; 607 608 /****************************************************************************** 609 610 Return the indexed line, where each line is identified by a \n 611 or \r\n combination. The line terminator is stripped from the 612 resultant line 613 614 Note that src content is not duplicated by this function, but 615 is sliced instead. 616 617 ******************************************************************************/ 618 619 inout(char)[] lineOf (inout(char)[] src, size_t index) 620 { 621 int i = 0; 622 623 // Cast is needed to avoid instantiating `LineFruct!(inout(char))` 624 // which can't compile. We cast line back to `inout` after so the end 625 // result is still type-checked. 626 627 foreach (line; lines(cast(cstring) src)) 628 if (i++ is index) 629 return cast(typeof(return)) line; 630 return null; 631 } 632 633 /****************************************************************************** 634 635 Combine a series of text segments together, each appended with 636 a postfix pattern. An optional output buffer can be provided to 637 avoid heap activity - it should be large enough to contain the 638 entire output, otherwise the heap will be used instead. 639 640 Returns a valid slice of the output, containing the concatenated 641 text. 642 643 ******************************************************************************/ 644 645 mstring join (const(char[])[] src, cstring postfix=null, mstring dst = null) 646 { 647 return combine(dst, null, postfix, src); 648 } 649 650 unittest 651 { 652 test (join([ "aaa", "bbb", "ccc" ], ",") == "aaa,bbb,ccc"); 653 654 // ensure `join` works with differently qualified arguments 655 const(char[][]) mut = [ "xxx".dup, "yyy".dup, "zzz" ]; 656 char[20] buf; 657 auto ret = join(mut, " ", buf); 658 test (ret == "xxx yyy zzz"); 659 test (ret.ptr is buf.ptr); 660 } 661 662 /****************************************************************************** 663 664 Combine a series of text segments together, each prepended with 665 a prefix pattern. An optional output buffer can be provided to 666 avoid heap activity - it should be large enough to contain the 667 entire output, otherwise the heap will be used instead. 668 669 Note that, unlike join(), the output buffer is specified first 670 such that a set of trailing strings can be provided. 671 672 Returns a valid slice of the output, containing the concatenated 673 text. 674 675 ******************************************************************************/ 676 677 mstring prefix (mstring dst, cstring prefix, const(char[])[] src...) 678 { 679 return combine(dst, prefix, null, src); 680 } 681 682 /****************************************************************************** 683 684 Combine a series of text segments together, each appended with an 685 optional postfix pattern. An optional output buffer can be provided 686 to avoid heap activity - it should be large enough to contain the 687 entire output, otherwise the heap will be used instead. 688 689 Note that, unlike join(), the output buffer is specified first 690 such that a set of trailing strings can be provided. 691 692 Returns a valid slice of the output, containing the concatenated 693 text. 694 695 ******************************************************************************/ 696 697 mstring postfix (mstring dst, cstring postfix, cstring[] src...) 698 { 699 return combine(dst, null, postfix, src); 700 } 701 702 /****************************************************************************** 703 704 Combine a series of text segments together, each prefixed and/or 705 postfixed with optional strings. An optional output buffer can be 706 provided to avoid heap activity - which should be large enough to 707 contain the entire output, otherwise the heap will be used instead. 708 709 Note that, unlike join(), the output buffer is specified first 710 such that a set of trailing strings can be provided. 711 712 Returns a valid slice of the output, containing the concatenated 713 text. 714 715 ******************************************************************************/ 716 717 mstring combine (mstring dst, cstring prefix, cstring postfix, const(char[])[] src ...) 718 { 719 size_t len = src.length * prefix.length + 720 src.length * postfix.length; 721 722 foreach (segment; src) 723 len += segment.length; 724 725 if (dst.length < len) 726 dst.length = len; 727 728 auto p = dst.ptr; 729 foreach (segment; src) 730 { 731 p[0 .. prefix.length] = prefix; 732 p += prefix.length; 733 p[0 .. segment.length] = segment; 734 p += segment.length; 735 p[0 .. postfix.length] = postfix; 736 p += postfix.length; 737 } 738 739 // remove trailing seperator 740 if (len) 741 len -= postfix.length; 742 return dst [0 .. len]; 743 } 744 745 /****************************************************************************** 746 747 Repeat an array for a specific number of times. An optional output 748 buffer can be provided to avoid heap activity - it should be large 749 enough to contain the entire output, otherwise the heap will be used 750 instead. 751 752 Returns a valid slice of the output, containing the concatenated 753 text. 754 755 ******************************************************************************/ 756 757 mstring repeat (cstring src, size_t count, mstring dst=null) 758 { 759 size_t len = src.length * count; 760 if (len is 0) 761 return null; 762 763 if (dst.length < len) 764 dst.length = len; 765 766 for (auto p = dst.ptr; count--; p += src.length) 767 p[0 .. src.length] = src; 768 769 return dst [0 .. len]; 770 } 771 772 /****************************************************************************** 773 774 Is the argument a whitespace character? 775 776 ******************************************************************************/ 777 778 bool isSpace (char c) 779 { 780 return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v')); 781 } 782 783 /****************************************************************************** 784 785 Return whether or not the two arrays have matching content 786 787 ******************************************************************************/ 788 789 bool matching (const(char)* s1, const(char)* s2, size_t length) 790 { 791 return mismatch(s1, s2, length) is length; 792 } 793 794 /****************************************************************************** 795 796 Returns the index of the first match in str, failing once 797 length is reached. Note that we return 'length' for failure 798 and a 0-based index on success 799 800 ******************************************************************************/ 801 802 size_t indexOf (const(char)* str, char match, size_t length) 803 { 804 enum m1 = cast(size_t) 0x0101010101010101; 805 enum m2 = cast(size_t) 0x8080808080808080; 806 807 if (length) 808 { 809 size_t m = match; 810 m += m << 8; 811 m += (m << (8 * 2)); 812 m += (m << (8 * 4)); 813 814 auto p = str; 815 auto e = p + length - size_t.sizeof; 816 while (p < e) 817 { 818 // clear matching T segments 819 auto v = (*cast(size_t*) p) ^ m; 820 // test for zero, courtesy of Alan Mycroft 821 if ((v - m1) & ~v & m2) 822 break; 823 p += size_t.sizeof; 824 } 825 826 e += size_t.sizeof; 827 while (p < e) 828 if (*p++ is match) 829 return cast(size_t) (p - str - 1); 830 } 831 return length; 832 } 833 834 /****************************************************************************** 835 836 Returns the index of a mismatch between s1 & s2, failing when 837 length is reached. Note that we return 'length' upon failure 838 (array content matches) and a 0-based index upon success. 839 840 Use this as a faster opEquals. Also provides the basis for a 841 faster opCmp, since the index of the first mismatched character 842 can be used to determine the return value 843 844 ******************************************************************************/ 845 846 size_t mismatch (const(char)* s1, const(char)* s2, size_t length) 847 { 848 verify(s1 && s2); 849 850 if (length) 851 { 852 auto start = s1; 853 auto e = start + length - size_t.sizeof; 854 855 while (s1 < e) 856 { 857 if (*cast(size_t*) s1 != *cast(size_t*) s2) 858 break; 859 s1 += size_t.sizeof; 860 s2 += size_t.sizeof; 861 } 862 863 e += size_t.sizeof; 864 while (s1 < e) 865 if (*s1++ != *s2++) 866 return s1 - start - 1; 867 } 868 869 return length; 870 } 871 872 /****************************************************************************** 873 874 Iterator to isolate lines. 875 876 Converts text into a set of lines, where each line is identified 877 by a \n or \r\n combination. The line terminator is stripped from 878 each resultant array. 879 880 --- 881 foreach (line; lines ("one\ntwo\nthree")) 882 ... 883 --- 884 885 ******************************************************************************/ 886 887 LineFruct!(T) lines(T) (T[] src) 888 { 889 LineFruct!(T) lines; 890 lines.src = src; 891 return lines; 892 } 893 894 /****************************************************************************** 895 896 Iterator to isolate text elements. 897 898 Splits the provided array wherever a delimiter-set instance is 899 found, and return the resultant segments. The delimiters are 900 excluded from each of the segments. Note that delimiters are 901 matched as a set of alternates rather than as a pattern. 902 903 Splitting on a single delimiter is considerably faster than 904 splitting upon a set of alternatives. 905 906 --- 907 foreach (segment; delimiters ("one,two;three", ",;")) 908 ... 909 --- 910 911 Has to be templated to propagate mutable/const input qualifier to 912 return struct. 913 914 ******************************************************************************/ 915 916 DelimFruct!T delimiters (T) (T[] src, cstring set) 917 { 918 DelimFruct!T elements; 919 elements.set = set; 920 elements.src = src; 921 return elements; 922 } 923 924 /****************************************************************************** 925 926 Iterator to isolate text elements. 927 928 Split the provided array wherever a pattern instance is found, 929 and return the resultant segments. Pattern are excluded from 930 each of the segments, and an optional sub argument enables 931 replacement. 932 933 --- 934 foreach (segment; patterns ("one, two, three", ", ")) 935 ... 936 --- 937 938 ******************************************************************************/ 939 940 PatternFruct!T patterns(T) (T[] src, cstring pattern, T[] sub = null) 941 { 942 PatternFruct!T elements; 943 elements.pattern = pattern; 944 elements.sub = sub; 945 elements.src = src; 946 return elements; 947 } 948 949 unittest 950 { 951 cstring[] arr; 952 foreach (match; patterns("aaa..bbb..ccc", "..")) 953 arr ~= match; 954 test (arr == [ "aaa", "bbb", "ccc" ]); 955 956 arr = [ ]; 957 foreach (match; patterns("aaa..bbb..ccc", "..", "X")) 958 arr ~= match; 959 test (arr == [ "aaa", "X", "bbb", "X", "ccc" ]); 960 } 961 962 /****************************************************************************** 963 964 Iterator to isolate optionally quoted text elements. 965 966 As per elements(), but with the extension of being quote-aware; 967 the set of delimiters is ignored inside a pair of quotes. Note 968 that an unterminated quote will consume remaining content. 969 970 --- 971 foreach (quote; quotes ("one two 'three four' five", " ")) 972 ... 973 --- 974 975 ******************************************************************************/ 976 977 QuoteFruct!T quotes(T) (T[] src, cstring set) 978 { 979 QuoteFruct!T quotes; 980 quotes.set = set; 981 quotes.src = src; 982 return quotes; 983 } 984 985 /****************************************************************************** 986 987 Convert 'escaped' chars to normal ones: \t => ^t for example. 988 Supports \" \' \\ \a \b \f \n \r \t \v 989 990 ******************************************************************************/ 991 992 cstring unescape (cstring src, mstring dst = null) 993 { 994 ptrdiff_t delta; 995 auto s = src.ptr; 996 auto len = src.length; 997 998 // take a peek first to see if there's anything 999 if ((delta = indexOf (s, '\\', len)) < len) 1000 { 1001 // make some room if not enough provided 1002 if (dst.length < src.length) 1003 dst.length = src.length; 1004 auto d = dst.ptr; 1005 1006 // copy segments over, a chunk at a time 1007 do { 1008 d [0 .. delta] = s [0 .. delta]; 1009 len -= delta; 1010 s += delta; 1011 d += delta; 1012 1013 // bogus trailing '\' 1014 if (len < 2) 1015 { 1016 *d++ = '\\'; 1017 len = 0; 1018 break; 1019 } 1020 1021 // translate \char 1022 char c = s[1]; 1023 switch (c) 1024 { 1025 case '\\': 1026 break; 1027 case '\'': 1028 c = '\''; 1029 break; 1030 case '"': 1031 c = '"'; 1032 break; 1033 case 'a': 1034 c = '\a'; 1035 break; 1036 case 'b': 1037 c = '\b'; 1038 break; 1039 case 'f': 1040 c = '\f'; 1041 break; 1042 case 'n': 1043 c = '\n'; 1044 break; 1045 case 'r': 1046 c = '\r'; 1047 break; 1048 case 't': 1049 c = '\t'; 1050 break; 1051 case 'v': 1052 c = '\v'; 1053 break; 1054 default: 1055 *d++ = '\\'; 1056 } 1057 *d++ = c; 1058 len -= 2; 1059 s += 2; 1060 } while ((delta = indexOf (s, '\\', len)) < len); 1061 1062 // copy tail too 1063 d [0 .. len] = s [0 .. len]; 1064 return dst [0 .. (d + len) - dst.ptr]; 1065 } 1066 return src; 1067 } 1068 1069 1070 /****************************************************************************** 1071 1072 jhash() -- hash a variable-length key into a 32-bit value 1073 1074 k : the key (the unaligned variable-length array of bytes) 1075 len : the length of the key, counting by bytes 1076 level : can be any 4-byte value 1077 1078 Returns a 32-bit value. Every bit of the key affects every bit of 1079 the return value. Every 1-bit and 2-bit delta achieves avalanche. 1080 1081 About 4.3*len + 80 X86 instructions, with excellent pipelining 1082 1083 The best hash table sizes are powers of 2. There is no need to do 1084 mod a prime (mod is sooo slow!). If you need less than 32 bits, 1085 use a bitmask. For example, if you need only 10 bits, do 1086 1087 h = (h & hashmask(10)); 1088 1089 In which case, the hash table should have hashsize(10) elements. 1090 If you are hashing n strings (ub1 **)k, do it like this: 1091 1092 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); 1093 1094 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use 1095 this code any way you wish, private, educational, or commercial. 1096 It's free. 1097 1098 See http://burtleburtle.net/bob/hash/evahash.html 1099 Use for hash table lookup, or anything where one collision in 2^32 1100 is acceptable. Do NOT use for cryptographic purposes. 1101 1102 ******************************************************************************/ 1103 1104 size_t jhash (ubyte* k, size_t len, size_t c = 0) 1105 { 1106 size_t a = 0x9e3779b9, 1107 b = 0x9e3779b9, 1108 i = len; 1109 1110 // handle most of the key 1111 while (i >= 12) 1112 { 1113 a += *cast(uint *)(k+0); 1114 b += *cast(uint *)(k+4); 1115 c += *cast(uint *)(k+8); 1116 1117 a -= b; a -= c; a ^= (c>>13); 1118 b -= c; b -= a; b ^= (a<<8); 1119 c -= a; c -= b; c ^= (b>>13); 1120 a -= b; a -= c; a ^= (c>>12); 1121 b -= c; b -= a; b ^= (a<<16); 1122 c -= a; c -= b; c ^= (b>>5); 1123 a -= b; a -= c; a ^= (c>>3); 1124 b -= c; b -= a; b ^= (a<<10); 1125 c -= a; c -= b; c ^= (b>>15); 1126 k += 12; i -= 12; 1127 } 1128 1129 // handle the last 11 bytes 1130 c += len; 1131 switch (i) 1132 { 1133 case 11: c+=(cast(uint)k[10]<<24); goto case; 1134 case 10: c+=(cast(uint)k[9]<<16); goto case; 1135 case 9 : c+=(cast(uint)k[8]<<8); goto case; 1136 case 8 : b+=(cast(uint)k[7]<<24); goto case; 1137 case 7 : b+=(cast(uint)k[6]<<16); goto case; 1138 case 6 : b+=(cast(uint)k[5]<<8); goto case; 1139 case 5 : b+=(cast(uint)k[4]); goto case; 1140 case 4 : a+=(cast(uint)k[3]<<24); goto case; 1141 case 3 : a+=(cast(uint)k[2]<<16); goto case; 1142 case 2 : a+=(cast(uint)k[1]<<8); goto case; 1143 case 1 : a+=(cast(uint)k[0]); goto default; 1144 default: 1145 } 1146 1147 a -= b; a -= c; a ^= (c>>13); 1148 b -= c; b -= a; b ^= (a<<8); 1149 c -= a; c -= b; c ^= (b>>13); 1150 a -= b; a -= c; a ^= (c>>12); 1151 b -= c; b -= a; b ^= (a<<16); 1152 c -= a; c -= b; c ^= (b>>5); 1153 a -= b; a -= c; a ^= (c>>3); 1154 b -= c; b -= a; b ^= (a<<10); 1155 c -= a; c -= b; c ^= (b>>15); 1156 1157 return c; 1158 } 1159 1160 /// ditto 1161 size_t jhash (void[] x, size_t c = 0) 1162 { 1163 return jhash (cast(ubyte*) x.ptr, x.length, c); 1164 } 1165 1166 1167 /****************************************************************************** 1168 1169 Helper fruct for iterator lines(). A fruct is a low 1170 impact mechanism for capturing context relating to an 1171 opApply (conjunction of the names struct and foreach) 1172 1173 ******************************************************************************/ 1174 1175 private struct LineFruct(T) 1176 { 1177 private T[] src; 1178 1179 int opApply (scope int delegate (ref T[] line) dg) 1180 { 1181 int ret; 1182 size_t pos, 1183 mark; 1184 T[] line; 1185 1186 enum T nl = '\n'; 1187 enum T cr = '\r'; 1188 1189 while ((pos = locate (src, nl, mark)) < src.length) 1190 { 1191 auto end = pos; 1192 if (end && src[end-1] is cr) 1193 --end; 1194 1195 line = src [mark .. end]; 1196 if ((ret = dg (line)) != 0) 1197 return ret; 1198 mark = pos + 1; 1199 } 1200 1201 line = src [mark .. $]; 1202 if (mark <= src.length) 1203 ret = dg (line); 1204 1205 return ret; 1206 } 1207 } 1208 1209 /****************************************************************************** 1210 1211 Helper fruct for iterator delims(). A fruct is a low 1212 impact mechanism for capturing context relating to an 1213 opApply (conjunction of the names struct and foreach) 1214 1215 ******************************************************************************/ 1216 1217 private struct DelimFruct (T) 1218 { 1219 private T[] src; 1220 private cstring set; 1221 1222 int opApply (scope int delegate (ref T[] token) dg) 1223 { 1224 int ret; 1225 size_t pos, 1226 mark; 1227 T[] token; 1228 1229 // optimize for single delimiter case 1230 if (set.length is 1) 1231 while ((pos = locate (src, set[0], mark)) < src.length) 1232 { 1233 token = src [mark .. pos]; 1234 if ((ret = dg (token)) != 0) 1235 return ret; 1236 mark = pos + 1; 1237 } 1238 else 1239 if (set.length > 1) 1240 foreach (i, elem; src) 1241 if (contains (set, elem)) 1242 { 1243 token = src [mark .. i]; 1244 if ((ret = dg (token)) != 0) 1245 return ret; 1246 mark = i + 1; 1247 } 1248 1249 token = src [mark .. $]; 1250 if (mark <= src.length) 1251 ret = dg (token); 1252 1253 return ret; 1254 } 1255 } 1256 1257 /****************************************************************************** 1258 1259 Helper fruct for iterator patterns(). A fruct is a low 1260 impact mechanism for capturing context relating to an 1261 opApply (conjunction of the names struct and foreach) 1262 1263 ******************************************************************************/ 1264 1265 public struct PatternFruct (T) 1266 { 1267 private T[] src, sub; 1268 private cstring pattern; 1269 1270 int opApply (scope int delegate (ref T[] token) dg) 1271 { 1272 int ret; 1273 size_t pos, 1274 mark; 1275 T[] token; 1276 1277 while ((pos = index (src, pattern, mark)) < src.length) 1278 { 1279 token = src [mark .. pos]; 1280 if ((ret = dg(token)) != 0) 1281 return ret; 1282 if (sub.ptr && (ret = dg(sub)) != 0) 1283 return ret; 1284 mark = pos + pattern.length; 1285 } 1286 1287 token = src [mark .. $]; 1288 if (mark <= src.length) 1289 ret = dg (token); 1290 1291 return ret; 1292 } 1293 } 1294 1295 /****************************************************************************** 1296 1297 Helper fruct for iterator quotes(). A fruct is a low 1298 impact mechanism for capturing context relating to an 1299 opApply (conjunction of the names struct and foreach) 1300 1301 ******************************************************************************/ 1302 1303 private struct QuoteFruct (T) 1304 { 1305 private T[] src; 1306 private cstring set; 1307 1308 int opApply (scope int delegate (ref T[] token) dg) 1309 { 1310 int ret; 1311 size_t mark; 1312 T[] token; 1313 1314 if (set.length) 1315 for (size_t i=0; i < src.length; ++i) 1316 { 1317 T c = src[i]; 1318 if (c is '"' || c is '\'') 1319 i = locate (src, c, i+1); 1320 else 1321 if (contains (set, c)) 1322 { 1323 token = src [mark .. i]; 1324 if ((ret = dg (token)) != 0) 1325 return ret; 1326 mark = i + 1; 1327 } 1328 } 1329 1330 token = src [mark .. $]; 1331 if (mark <= src.length) 1332 ret = dg (token); 1333 1334 return ret; 1335 } 1336 } 1337 1338 1339 /****************************************************************************** 1340 1341 ******************************************************************************/ 1342 1343 unittest 1344 { 1345 char[64] tmp; 1346 1347 test (isSpace (' ') && !isSpace ('d')); 1348 1349 test (indexOf ("abc".ptr, 'a', 3) is 0); 1350 test (indexOf ("abc".ptr, 'b', 3) is 1); 1351 test (indexOf ("abc".ptr, 'c', 3) is 2); 1352 test (indexOf ("abc".ptr, 'd', 3) is 3); 1353 test (indexOf ("abcabcabc".ptr, 'd', 9) is 9); 1354 1355 test (mismatch ("abc".ptr, "abc".ptr, 3) is 3); 1356 test (mismatch ("abc".ptr, "abd".ptr, 3) is 2); 1357 test (mismatch ("abc".ptr, "acc".ptr, 3) is 1); 1358 test (mismatch ("abc".ptr, "ccc".ptr, 3) is 0); 1359 1360 test (matching ("abc".ptr, "abc".ptr, 3)); 1361 test (matching ("abc".ptr, "abb".ptr, 3) is false); 1362 1363 test (contains ("abc", 'a')); 1364 test (contains ("abc", 'b')); 1365 test (contains ("abc", 'c')); 1366 test (contains ("abc", 'd') is false); 1367 1368 test (containsPattern ("abc", "ab")); 1369 test (containsPattern ("abc", "bc")); 1370 test (containsPattern ("abc", "abc")); 1371 test (containsPattern ("abc", "zabc") is false); 1372 test (containsPattern ("abc", "abcd") is false); 1373 test (containsPattern ("abc", "za") is false); 1374 test (containsPattern ("abc", "cd") is false); 1375 1376 test (trim ("") == ""); 1377 test (trim (" abc ") == "abc"); 1378 test (trim (" ") == ""); 1379 1380 test (strip ("", '%') == ""); 1381 test (strip ("%abc%%%", '%') == "abc"); 1382 test (strip ("#####", '#') == ""); 1383 test (stripl ("#####", '#') == ""); 1384 test (stripl (" ###", ' ') == "###"); 1385 test (stripl ("#####", 's') == "#####"); 1386 test (stripr ("#####", '#') == ""); 1387 test (stripr ("### ", ' ') == "###"); 1388 test (stripr ("#####", 's') == "#####"); 1389 1390 test (replace ("abc".dup, 'b', ':') == "a:c"); 1391 test (substitute ("abc", "bc", "x") == "ax"); 1392 1393 test (locate ("abc".dup, 'c', 1) is 2); 1394 1395 test (locate ("abc", 'c') is 2); 1396 test (locate ("abc", 'a') is 0); 1397 test (locate ("abc", 'd') is 3); 1398 test (locate ("", 'c') is 0); 1399 1400 test (locatePrior ("abce".dup, 'c') is 2); 1401 test (locatePrior ("abce", 'a') is 0); 1402 test (locatePrior ("abce", 'd') is 4); 1403 test (locatePrior ("abce", 'c', 3) is 2); 1404 test (locatePrior ("abce", 'c', 2) is 4); 1405 test (locatePrior ("", 'c') is 0); 1406 1407 auto x = delimit ("::b", ":"); 1408 test (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b"); 1409 x = delimit ("a:bc:d", ":"); 1410 test (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d"); 1411 x = delimit ("abcd", ":"); 1412 test (x.length is 1 && x[0] == "abcd"); 1413 x = delimit ("abcd:", ":"); 1414 test (x.length is 2 && x[0] == "abcd" && x[1] == ""); 1415 x = delimit ("a;b$c#d:e@f", ";:$#@"); 1416 test (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" && 1417 x[3]=="d" && x[4]=="e" && x[5]=="f"); 1418 1419 test (locatePattern ("abcdefg".dup, "") is 7); 1420 test (locatePattern ("abcdefg", "g") is 6); 1421 test (locatePattern ("abcdefg", "abcdefg") is 0); 1422 test (locatePattern ("abcdefg", "abcdefgx") is 7); 1423 test (locatePattern ("abcdefg", "cce") is 7); 1424 test (locatePattern ("abcdefg", "cde") is 2); 1425 test (locatePattern ("abcdefgcde", "cde", 3) is 7); 1426 1427 test (locatePatternPrior ("abcdefg".dup, "") is 7); 1428 test (locatePatternPrior ("abcdefg", "cce") is 7); 1429 test (locatePatternPrior ("abcdefg", "cde") is 2); 1430 test (locatePatternPrior ("abcdefgcde", "cde", 6) is 2); 1431 test (locatePatternPrior ("abcdefgcde", "cde", 4) is 2); 1432 test (locatePatternPrior ("abcdefg", "abcdefgx") is 7); 1433 1434 x = splitLines ("a\nb\n"); 1435 test (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == ""); 1436 x = splitLines ("a\r\n"); 1437 test (x.length is 2 && x[0] == "a" && x[1] == ""); 1438 1439 x = splitLines ("a"); 1440 test (x.length is 1 && x[0] == "a"); 1441 x = splitLines (""); 1442 test (x.length is 1); 1443 1444 cstring[] q; 1445 foreach (element; quotes ("1 'avcc cc ' 3", " ")) 1446 q ~= element; 1447 test (q.length is 3 && q[0] == "1" && q[1] == "'avcc cc '" && q[2] == "3"); 1448 1449 x = split ("one, two, three", ","); 1450 test (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three"); 1451 x = split ("one, two, three", ", "); 1452 test (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three"); 1453 x = split ("one, two, three", ",,"); 1454 test (x.length is 1 && x[0] == "one, two, three"); 1455 x = split ("one,,", ","); 1456 test (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == ""); 1457 1458 istring t, h; 1459 h = head ("one:two:three", ":", t); 1460 test (h == "one" && t == "two:three"); 1461 h = head ("one:::two:three", ":::", t); 1462 test (h == "one" && t == "two:three"); 1463 h = head ("one:two:three", "*", t); 1464 test (h == "one:two:three" && t is null); 1465 1466 t = tail ("one:two:three", ":", h); 1467 test (h == "one:two" && t == "three"); 1468 t = tail ("one:::two:three", ":::", h); 1469 test (h == "one" && t == "two:three"); 1470 t = tail ("one:two:three", "*", h); 1471 test (t == "one:two:three" && h is null); 1472 1473 test (chopl("hello world", "hello ") == "world"); 1474 test (chopl("hello", "hello") == ""); 1475 test (chopl("hello world", " ") == "hello world"); 1476 test (chopl("hello world", "") == "hello world"); 1477 1478 test (chopr("hello world", " world") == "hello"); 1479 test (chopr("hello", "hello") == ""); 1480 test (chopr("hello world", " ") == "hello world"); 1481 test (chopr("hello world", "") == "hello world"); 1482 1483 istring[] foo = ["one", "two", "three"]; 1484 auto j = join (foo); 1485 test (j == "onetwothree"); 1486 j = join (foo, ", "); 1487 test (j == "one, two, three"); 1488 j = join (foo, " ", tmp); 1489 test (j == "one two three"); 1490 test (j.ptr is tmp.ptr); 1491 1492 test (repeat ("abc", 0) == ""); 1493 test (repeat ("abc", 1) == "abc"); 1494 test (repeat ("abc", 2) == "abcabc"); 1495 test (repeat ("abc", 4) == "abcabcabcabc"); 1496 test (repeat ("", 4) == ""); 1497 char[10] rep; 1498 test (repeat ("abc", 0, rep) == ""); 1499 test (repeat ("abc", 1, rep) == "abc"); 1500 test (repeat ("abc", 2, rep) == "abcabc"); 1501 test (repeat ("", 4, rep) == ""); 1502 1503 test (unescape ("abc") == "abc"); 1504 test (unescape ("abc\\") == "abc\\"); 1505 test (unescape ("abc\\t") == "abc\t"); 1506 test (unescape ("abc\\tc") == "abc\tc"); 1507 test (unescape ("\\t") == "\t"); 1508 test (unescape ("\\tx") == "\tx"); 1509 test (unescape ("\\v\\vx") == "\v\vx"); 1510 test (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc"); 1511 } 1512 1513 debug (Util) 1514 { 1515 auto x = import("Util.d"); 1516 1517 void main() 1518 { 1519 mismatch ("".ptr, x.ptr, 0); 1520 indexOf ("".ptr, '@', 0); 1521 char[] s; 1522 split (s, " "); 1523 //indexOf (s.ptr, '@', 0); 1524 1525 } 1526 }