1 /*******************************************************************************
2 
3         Placeholder for a variety of wee functions.
4 
5         Several of these functions return an index value, representing where
6         some criteria was identified. When said criteria is not matched, the
7         functions return a value representing the array length provided to
8         them. That is, for those scenarios where C functions might typically
9         return -1 these functions return length instead. This operate nicely
10         with D slices:
11         ---
12         auto text = "happy:faces";
13 
14         assert (text[0 .. locate (text, ':')] == "happy");
15 
16         assert (text[0 .. locate (text, '!')] == "happy:faces");
17         ---
18 
19         The contains() function is more convenient for trivial
20         lookup cases:
21         ---
22         if (contains ("fubar", '!'))
23             ...
24         ---
25 
26         Note that where some functions expect a size_t as an argument, the
27         D template-matching algorithm will fail where an int is provided
28         instead. This is the typically the cause of "template not found"
29         errors. Also note that name overloading is not supported cleanly
30         by IFTI at this time, so is not applied here.
31 
32 
33         Applying the D "import alias" mechanism to this module is highly
34         recommended, in order to limit namespace pollution:
35         ---
36         import Util = ocean.text.Util;
37 
38         auto s = Util.trim ("  foo ");
39         ---
40 
41 
42         Function templates:
43         ---
44         trim (source)                               // trim whitespace
45         triml (source)                              // trim whitespace
46         trimr (source)                              // trim whitespace
47         strip (source, match)                       // trim elements
48         stripl (source, match)                      // trim elements
49         stripr (source, match)                      // trim elements
50         chopl (source, match)                       // trim pattern match
51         chopr (source, match)                       // trim pattern match
52         delimit (src, set)                          // split on delims
53         split (source, pattern)                     // split on pattern
54         splitLines (source);                        // split on lines
55         head (source, pattern, tail)                // split to head & tail
56         join (source, postfix, output)              // join text segments
57         prefix (dst, prefix, content...)            // prefix text segments
58         postfix (dst, postfix, content...)          // postfix text segments
59         combine (dst, prefix, postfix, content...)  // combine lotsa stuff
60         repeat (source, count, output)              // repeat source
61         replace (source, match, replacement)        // replace chars
62         substitute (source, match, replacement)     // replace/remove matches
63         count (source, match)                       // count instances
64         contains (source, match)                    // has char?
65         containsPattern (source, match)             // has pattern?
66         index (source, match, start)                // find match index
67         locate (source, match, start)               // find char
68         locatePrior (source, match, start)          // find prior char
69         locatePattern (source, match, start);       // find pattern
70         locatePatternPrior (source, match, start);  // find prior pattern
71         indexOf (s*, match, length)                 // low-level lookup
72         mismatch (s1*, s2*, length)                 // low-level compare
73         matching (s1*, s2*, length)                 // low-level compare
74         isSpace (match)                             // is whitespace?
75         unescape(source, output)                    // convert '\' prefixes
76         lines (str)                                 // foreach lines
77         quotes (str, set)                           // foreach quotes
78         delimiters (str, set)                       // foreach delimiters
79         patterns (str, pattern)                     // foreach patterns
80         ---
81 
82         Please note that any 'pattern' referred to within this module
83         refers to a pattern of characters, and not some kind of regex
84         descriptor. Use the Regex module for regex operation.
85 
86         Copyright:
87             Copyright (c) 2004 Kris Bell.
88             Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
89             All rights reserved.
90 
91         License:
92             Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
93             See LICENSE_TANGO.txt for details.
94 
95         Version:
96             Apr 2004: Initial release
97             Dec 2006: South Seas version
98 
99         Authors: Kris
100 
101 *******************************************************************************/
102 
103 module ocean.text.Util;
104 
105 //import ocean.meta.types.Qualifiers : cstring, mstring;
106 import ocean.meta.types.Qualifiers;
107 import ocean.core.Verify;
108 
109 version (unittest) import ocean.core.Test;
110 
111 
112 /******************************************************************************
113 
114         Trim the provided array by stripping whitespace from both
115         ends. Returns a slice of the original content
116 
117 ******************************************************************************/
118 
119 inout(char)[] trim (inout(char)[] source)
120 {
121         inout(char)*   head = source.ptr,
122                        tail = head + source.length;
123 
124         while (head < tail && isSpace(*head))
125                ++head;
126 
127         while (tail > head && isSpace(*(tail-1)))
128                --tail;
129 
130         return head [0 .. tail - head];
131 }
132 
133 /******************************************************************************
134 
135         Trim the provided array by stripping whitespace from the left.
136         Returns a slice of the original content
137 
138 ******************************************************************************/
139 
140 inout(char)[] triml (inout(char)[] source)
141 {
142         inout(char)*   head = source.ptr,
143                        tail = head + source.length;
144 
145         while (head < tail && isSpace(*head))
146                ++head;
147 
148         return head [0 .. tail - head];
149 }
150 
151 /******************************************************************************
152 
153         Trim the provided array by stripping whitespace from the right.
154         Returns a slice of the original content
155 
156 ******************************************************************************/
157 
158 inout(char)[] trimr (inout(char)[] source)
159 {
160         inout(char)*   head = source.ptr,
161                        tail = head + source.length;
162 
163         while (tail > head && isSpace(*(tail-1)))
164                --tail;
165 
166         return head [0 .. tail - head];
167 }
168 
169 /******************************************************************************
170 
171         Trim the given array by stripping the provided match from
172         both ends. Returns a slice of the original content
173 
174 ******************************************************************************/
175 
176 inout(char)[] strip (inout(char)[] source, char match)
177 {
178         inout(char)*   head = source.ptr,
179                        tail = head + source.length;
180 
181         while (head < tail && *head is match)
182                ++head;
183 
184         while (tail > head && *(tail-1) is match)
185                --tail;
186 
187         return head [0 .. tail - head];
188 }
189 
190 /******************************************************************************
191 
192         Trim the given array by stripping the provided match from
193         the left hand side. Returns a slice of the original content
194 
195 ******************************************************************************/
196 
197 inout(char)[] stripl (inout(char)[] source, char match)
198 {
199         inout(char)* head = source.ptr,
200                      tail = head + source.length;
201 
202         while (head < tail && *head is match)
203                ++head;
204 
205         return head [0 .. tail - head];
206 }
207 
208 /******************************************************************************
209 
210         Trim the given array by stripping the provided match from
211         the right hand side. Returns a slice of the original content
212 
213 ******************************************************************************/
214 
215 inout(char)[] stripr (inout(char)[] source, char match)
216 {
217         inout(char)* head = source.ptr,
218                      tail = head + source.length;
219 
220         while (tail > head && *(tail-1) is match)
221                --tail;
222 
223         return head [0 .. tail - head];
224 }
225 
226 /******************************************************************************
227 
228         Chop the given source by stripping the provided match from
229         the left hand side. Returns a slice of the original content
230 
231 ******************************************************************************/
232 
233 inout(char)[] chopl (inout(char)[] source, cstring match)
234 {
235         if (match.length <= source.length)
236             if (source[0 .. match.length] == match)
237                 source = source [match.length .. $];
238 
239         return source;
240 }
241 
242 /******************************************************************************
243 
244         Chop the given source by stripping the provided match from
245         the right hand side. Returns a slice of the original content
246 
247 ******************************************************************************/
248 
249 inout(char)[] chopr (inout(char)[] source, cstring match)
250 {
251         if (match.length <= source.length)
252             if (source[$-match.length .. $] == match)
253                 source = source [0 .. $-match.length];
254 
255         return source;
256 }
257 
258 /******************************************************************************
259 
260         Replace all instances of one element with another (in place)
261 
262 ******************************************************************************/
263 
264 mstring replace (mstring source, char match, char replacement)
265 {
266         foreach (ref c; source)
267                  if (c is match)
268                      c = replacement;
269         return source;
270 }
271 
272 /******************************************************************************
273 
274         Substitute all instances of match from source. Set replacement
275         to null in order to remove instead of replace
276 
277 ******************************************************************************/
278 
279 mstring substitute (cstring source, cstring match, cstring replacement)
280 {
281         mstring output;
282 
283         foreach (s; patterns (source, match, replacement))
284         {
285                     output ~= s;
286         }
287         return output;
288 }
289 
290 /******************************************************************************
291 
292         Count all instances of match within source
293 
294 ******************************************************************************/
295 
296 size_t count (cstring source, cstring match)
297 {
298         size_t c;
299 
300         foreach (s; patterns (source, match))
301                     ++c;
302         verify(c > 0);
303         return c - 1;
304 }
305 
306 /******************************************************************************
307 
308         Returns whether or not the provided array contains an instance
309         of the given match
310 
311 ******************************************************************************/
312 
313 bool contains (cstring source, char match)
314 {
315         return indexOf (source.ptr, match, source.length) != source.length;
316 }
317 
318 /******************************************************************************
319 
320         Returns whether or not the provided array contains an instance
321         of the given match
322 
323 ******************************************************************************/
324 
325 bool containsPattern (cstring source, cstring match)
326 {
327         return locatePattern (source, match) != source.length;
328 }
329 
330 unittest
331 {
332     mstring greeting = "Hello world".dup;
333     istring satan = "Hell";
334     test(containsPattern(greeting, satan), "Pattern not found");
335 }
336 
337 
338 /******************************************************************************
339 
340         Return the index of the next instance of 'match' starting at
341         position 'start', or source.length where there is no match.
342 
343         Parameter 'start' defaults to 0
344 
345 ******************************************************************************/
346 
347 size_t index (cstring source, cstring match, size_t start=0)
348 {
349         return (match.length is 1) ? locate (source, match[0], start)
350                                    : locatePattern (source, match, start);
351 }
352 
353 unittest
354 {
355     char[] url;
356     size_t start;
357     cstring CSLASHSLASH = "://";
358     start = index(url, CSLASHSLASH);
359 }
360 
361 /******************************************************************************
362 
363         Return the index of the prior instance of 'match' starting
364         just before 'start', or source.length where there is no match.
365 
366         Parameter 'start' defaults to source.length
367 
368 ******************************************************************************/
369 
370 size_t rindex (cstring source, cstring match, size_t start=size_t.max)
371 {
372         return (match.length is 1) ? locatePrior (source, match[0], start)
373                                    : locatePatternPrior (source, match, start);
374 }
375 
376 /******************************************************************************
377 
378         Return the index of the next instance of 'match' starting at
379         position 'start', or source.length where there is no match.
380 
381         Parameter 'start' defaults to 0
382 
383 ******************************************************************************/
384 
385 size_t locate (cstring source, char match, size_t start=0)
386 {
387         if (start > source.length)
388             start = source.length;
389 
390         return indexOf (source.ptr+start, match, source.length - start) + start;
391 }
392 
393 unittest
394 {
395     size_t start;
396     char[] url;
397     start = locate(url, '/', start);
398 }
399 
400 /******************************************************************************
401 
402         Return the index of the prior instance of 'match' starting
403         just before 'start', or source.length where there is no match.
404 
405         Parameter 'start' defaults to source.length
406 
407 ******************************************************************************/
408 
409 size_t locatePrior (cstring source, char match, size_t start = size_t.max)
410 {
411         if (start > source.length)
412             start = source.length;
413 
414         while (start > 0)
415                if (source[--start] is match)
416                    return start;
417         return source.length;
418 }
419 
420 /******************************************************************************
421 
422         Return the index of the next instance of 'match' starting at
423         position 'start', or source.length where there is no match.
424 
425         Parameter 'start' defaults to 0
426 
427 ******************************************************************************/
428 
429 size_t locatePattern (cstring source, cstring match, size_t start=0)
430 {
431         size_t idx;
432         const(char)* p = source.ptr + start;
433         size_t extent = source.length - start - match.length + 1;
434 
435         if (match.length && extent <= source.length)
436         {
437             while (extent)
438                 if ((idx = indexOf (p, match[0], extent)) is extent)
439                     break;
440                 else
441                 {
442                     if (matching (p+=idx, match.ptr, match.length))
443                         return p - source.ptr;
444                     else
445                     {
446                         extent -= (idx+1);
447                         ++p;
448                     }
449                 }
450         }
451 
452         return source.length;
453 }
454 
455 /******************************************************************************
456 
457         Return the index of the prior instance of 'match' starting
458         just before 'start', or source.length where there is no match.
459 
460         Parameter 'start' defaults to source.length
461 
462 ******************************************************************************/
463 
464 size_t locatePatternPrior (cstring source, cstring match, size_t start=size_t.max)
465 {
466         auto len = source.length;
467 
468         if (start > len)
469             start = len;
470 
471         if (match.length && match.length <= len)
472             while (start)
473                   {
474                   start = locatePrior (source, match[0], start);
475                   if (start is len)
476                       break;
477                   else
478                      if ((start + match.length) <= len)
479                           if (matching (source.ptr+start, match.ptr, match.length))
480                               return start;
481                   }
482 
483         return len;
484 }
485 
486 /******************************************************************************
487 
488         Split the provided array on the first pattern instance, and
489         return the resultant head and tail. The pattern is excluded
490         from the two segments.
491 
492         Where a segment is not found, tail will be null and the return
493         value will be the original array.
494 
495 ******************************************************************************/
496 
497 inout(char)[] head (inout(char)[] src, cstring pattern, out inout(char)[] tail)
498 {
499         auto i = locatePattern (src, pattern);
500         if (i != src.length)
501            {
502            tail = src [i + pattern.length .. $];
503            src = src [0 .. i];
504            }
505         return src;
506 }
507 
508 /******************************************************************************
509 
510         Split the provided array on the last pattern instance, and
511         return the resultant head and tail. The pattern is excluded
512         from the two segments.
513 
514         Where a segment is not found, head will be null and the return
515         value will be the original array.
516 
517 ******************************************************************************/
518 
519 inout(char)[] tail (inout(char)[] src, cstring pattern, out inout(char)[] head)
520 {
521         auto i = locatePatternPrior (src, pattern);
522         if (i != src.length)
523            {
524            head = src [0 .. i];
525            src = src [i + pattern.length .. $];
526            }
527         return src;
528 }
529 
530 /******************************************************************************
531 
532         Split the provided array wherever a delimiter-set instance is
533         found, and return the resultant segments. The delimiters are
534         excluded from each of the segments. Note that delimiters are
535         matched as a set of alternates rather than as a pattern.
536 
537         Splitting on a single delimiter is considerably faster than
538         splitting upon a set of alternatives.
539 
540         Note that the src content is not duplicated by this function,
541         but is sliced instead.
542 
543 ******************************************************************************/
544 
545 inout(char)[][] delimit (inout(char)[] src, cstring set)
546 {
547         typeof(return) result;
548 
549         // Cast is needed to avoid instantiating `DelimFruct!(inout(char))`
550         // which can't compile. We cast segment back to `inout` after so the end
551         // result is still type-checked.
552 
553         foreach (segment; delimiters (cast(cstring) src, set))
554                  result ~= cast(typeof(src)) segment;
555         return result;
556 }
557 
558 /******************************************************************************
559 
560         Split the provided array wherever a pattern instance is
561         found, and return the resultant segments. The pattern is
562         excluded from each of the segments.
563 
564         Note that the src content is not duplicated by this function,
565         but is sliced instead.
566 
567 ******************************************************************************/
568 
569 inout(char)[][] split (inout(char)[] src, cstring pattern)
570 {
571         typeof(return) result;
572 
573         // Cast is needed to avoid instantiating `PatternFruct!(inout(char))`
574         // which can't compile. We cast segment back to `inout` after so the end
575         // result is still type-checked.
576 
577         foreach (segment; patterns (cast(cstring) src, pattern))
578                  result ~= cast(typeof(src)) segment;
579         return result;
580 }
581 
582 /******************************************************************************
583 
584         Convert text into a set of lines, where each line is identified
585         by a \n or \r\n combination. The line terminator is stripped from
586         each resultant array
587 
588         Note that the src content is not duplicated by this function, but
589         is sliced instead.
590 
591 ******************************************************************************/
592 
593 inout(char)[][] toLines (inout(char)[] src)
594 {
595         typeof(return) result;
596 
597         // Cast is needed to avoid instantiating `LineFruct!(inout(char))`
598         // which can't compile. We cast line back to `inout` after so the end
599         // result is still type-checked.
600 
601         foreach (line; lines (cast(cstring) src))
602                  result ~= cast(typeof(src)) line;
603         return result;
604 }
605 
606 alias toLines splitLines;
607 
608 /******************************************************************************
609 
610         Return the indexed line, where each line is identified by a \n
611         or \r\n combination. The line terminator is stripped from the
612         resultant line
613 
614         Note that src content is not duplicated by this function, but
615         is sliced instead.
616 
617 ******************************************************************************/
618 
619 inout(char)[] lineOf (inout(char)[] src, size_t index)
620 {
621         int i = 0;
622 
623         // Cast is needed to avoid instantiating `LineFruct!(inout(char))`
624         // which can't compile. We cast line back to `inout` after so the end
625         // result is still type-checked.
626 
627         foreach (line; lines(cast(cstring) src))
628                  if (i++ is index)
629                      return cast(typeof(return)) line;
630         return null;
631 }
632 
633 /******************************************************************************
634 
635         Combine a series of text segments together, each appended with
636         a postfix pattern. An optional output buffer can be provided to
637         avoid heap activity - it should be large enough to contain the
638         entire output, otherwise the heap will be used instead.
639 
640         Returns a valid slice of the output, containing the concatenated
641         text.
642 
643 ******************************************************************************/
644 
645 mstring join (const(char[])[] src, cstring postfix=null, mstring dst = null)
646 {
647         return combine(dst, null, postfix, src);
648 }
649 
650 unittest
651 {
652     test (join([ "aaa", "bbb", "ccc" ], ",") == "aaa,bbb,ccc");
653 
654     // ensure `join` works with differently qualified arguments
655     const(char[][]) mut = [ "xxx".dup, "yyy".dup, "zzz" ];
656     char[20] buf;
657     auto ret = join(mut, " ", buf);
658     test (ret == "xxx yyy zzz");
659     test (ret.ptr is buf.ptr);
660 }
661 
662 /******************************************************************************
663 
664         Combine a series of text segments together, each prepended with
665         a prefix pattern. An optional output buffer can be provided to
666         avoid heap activity - it should be large enough to contain the
667         entire output, otherwise the heap will be used instead.
668 
669         Note that, unlike join(), the output buffer is specified first
670         such that a set of trailing strings can be provided.
671 
672         Returns a valid slice of the output, containing the concatenated
673         text.
674 
675 ******************************************************************************/
676 
677 mstring prefix (mstring dst, cstring prefix, const(char[])[] src...)
678 {
679         return combine(dst, prefix, null, src);
680 }
681 
682 /******************************************************************************
683 
684         Combine a series of text segments together, each appended with an
685         optional postfix pattern. An optional output buffer can be provided
686         to avoid heap activity - it should be large enough to contain the
687         entire output, otherwise the heap will be used instead.
688 
689         Note that, unlike join(), the output buffer is specified first
690         such that a set of trailing strings can be provided.
691 
692         Returns a valid slice of the output, containing the concatenated
693         text.
694 
695 ******************************************************************************/
696 
697 mstring postfix (mstring dst, cstring postfix, cstring[] src...)
698 {
699         return combine(dst, null, postfix, src);
700 }
701 
702 /******************************************************************************
703 
704         Combine a series of text segments together, each prefixed and/or
705         postfixed with optional strings. An optional output buffer can be
706         provided to avoid heap activity - which should be large enough to
707         contain the entire output, otherwise the heap will be used instead.
708 
709         Note that, unlike join(), the output buffer is specified first
710         such that a set of trailing strings can be provided.
711 
712         Returns a valid slice of the output, containing the concatenated
713         text.
714 
715 ******************************************************************************/
716 
717 mstring combine (mstring dst, cstring prefix, cstring postfix, const(char[])[] src ...)
718 {
719         size_t len = src.length * prefix.length +
720                    src.length * postfix.length;
721 
722         foreach (segment; src)
723                  len += segment.length;
724 
725         if (dst.length < len)
726             dst.length = len;
727 
728         auto p = dst.ptr;
729         foreach (segment; src)
730                 {
731                 p[0 .. prefix.length] = prefix;
732                 p += prefix.length;
733                 p[0 .. segment.length] = segment;
734                 p += segment.length;
735                 p[0 .. postfix.length] = postfix;
736                 p += postfix.length;
737                 }
738 
739         // remove trailing seperator
740         if (len)
741             len -= postfix.length;
742         return dst [0 .. len];
743 }
744 
745 /******************************************************************************
746 
747         Repeat an array for a specific number of times. An optional output
748         buffer can be provided to avoid heap activity - it should be large
749         enough to contain the entire output, otherwise the heap will be used
750         instead.
751 
752         Returns a valid slice of the output, containing the concatenated
753         text.
754 
755 ******************************************************************************/
756 
757 mstring repeat (cstring src, size_t count, mstring dst=null)
758 {
759         size_t len = src.length * count;
760         if (len is 0)
761             return null;
762 
763         if (dst.length < len)
764             dst.length = len;
765 
766         for (auto p = dst.ptr; count--; p += src.length)
767              p[0 .. src.length] = src;
768 
769         return dst [0 .. len];
770 }
771 
772 /******************************************************************************
773 
774         Is the argument a whitespace character?
775 
776 ******************************************************************************/
777 
778 bool isSpace (char c)
779 {
780         return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v'));
781 }
782 
783 /******************************************************************************
784 
785         Return whether or not the two arrays have matching content
786 
787 ******************************************************************************/
788 
789 bool matching (const(char)* s1, const(char)* s2, size_t length)
790 {
791         return mismatch(s1, s2, length) is length;
792 }
793 
794 /******************************************************************************
795 
796         Returns the index of the first match in str, failing once
797         length is reached. Note that we return 'length' for failure
798         and a 0-based index on success
799 
800 ******************************************************************************/
801 
802 size_t indexOf (const(char)* str, char match, size_t length)
803 {
804         enum m1 = cast(size_t) 0x0101010101010101;
805         enum m2 = cast(size_t) 0x8080808080808080;
806 
807         if (length)
808         {
809            size_t m = match;
810            m += m << 8;
811            m += (m << (8 * 2));
812            m += (m << (8 * 4));
813 
814            auto p = str;
815            auto e = p + length - size_t.sizeof;
816            while (p < e)
817            {
818                  // clear matching T segments
819                  auto v = (*cast(size_t*) p) ^ m;
820                  // test for zero, courtesy of Alan Mycroft
821                  if ((v - m1) & ~v & m2)
822                       break;
823                  p += size_t.sizeof;
824            }
825 
826            e += size_t.sizeof;
827            while (p < e)
828                   if (*p++ is match)
829                       return cast(size_t) (p - str - 1);
830         }
831         return length;
832 }
833 
834 /******************************************************************************
835 
836         Returns the index of a mismatch between s1 & s2, failing when
837         length is reached. Note that we return 'length' upon failure
838         (array content matches) and a 0-based index upon success.
839 
840         Use this as a faster opEquals. Also provides the basis for a
841         faster opCmp, since the index of the first mismatched character
842         can be used to determine the return value
843 
844 ******************************************************************************/
845 
846 size_t mismatch (const(char)* s1, const(char)* s2, size_t length)
847 {
848         verify(s1 && s2);
849 
850         if (length)
851         {
852            auto start = s1;
853            auto e = start + length - size_t.sizeof;
854 
855            while (s1 < e)
856                  {
857                  if (*cast(size_t*) s1 != *cast(size_t*) s2)
858                      break;
859                  s1 += size_t.sizeof;
860                  s2 += size_t.sizeof;
861                  }
862 
863            e += size_t.sizeof;
864            while (s1 < e)
865                   if (*s1++ != *s2++)
866                       return s1 - start - 1;
867         }
868 
869         return length;
870 }
871 
872 /******************************************************************************
873 
874         Iterator to isolate lines.
875 
876         Converts text into a set of lines, where each line is identified
877         by a \n or \r\n combination. The line terminator is stripped from
878         each resultant array.
879 
880         ---
881         foreach (line; lines ("one\ntwo\nthree"))
882                  ...
883         ---
884 
885 ******************************************************************************/
886 
887 LineFruct!(T) lines(T) (T[] src)
888 {
889         LineFruct!(T) lines;
890         lines.src = src;
891         return lines;
892 }
893 
894 /******************************************************************************
895 
896         Iterator to isolate text elements.
897 
898         Splits the provided array wherever a delimiter-set instance is
899         found, and return the resultant segments. The delimiters are
900         excluded from each of the segments. Note that delimiters are
901         matched as a set of alternates rather than as a pattern.
902 
903         Splitting on a single delimiter is considerably faster than
904         splitting upon a set of alternatives.
905 
906         ---
907         foreach (segment; delimiters ("one,two;three", ",;"))
908                  ...
909         ---
910 
911         Has to be templated to propagate mutable/const input qualifier to
912         return struct.
913 
914 ******************************************************************************/
915 
916 DelimFruct!T delimiters (T) (T[] src, cstring set)
917 {
918         DelimFruct!T elements;
919         elements.set = set;
920         elements.src = src;
921         return elements;
922 }
923 
924 /******************************************************************************
925 
926         Iterator to isolate text elements.
927 
928         Split the provided array wherever a pattern instance is found,
929         and return the resultant segments. Pattern are excluded from
930         each of the segments, and an optional sub argument enables
931         replacement.
932 
933         ---
934         foreach (segment; patterns ("one, two, three", ", "))
935                  ...
936         ---
937 
938 ******************************************************************************/
939 
940 PatternFruct!T patterns(T) (T[] src, cstring pattern, T[] sub = null)
941 {
942         PatternFruct!T elements;
943         elements.pattern = pattern;
944         elements.sub = sub;
945         elements.src = src;
946         return elements;
947 }
948 
949 unittest
950 {
951     cstring[] arr;
952     foreach (match; patterns("aaa..bbb..ccc", ".."))
953         arr ~= match;
954     test (arr == [ "aaa", "bbb", "ccc" ]);
955 
956     arr = [ ];
957     foreach (match; patterns("aaa..bbb..ccc", "..", "X"))
958         arr ~= match;
959     test (arr == [ "aaa", "X", "bbb", "X", "ccc" ]);
960 }
961 
962 /******************************************************************************
963 
964         Iterator to isolate optionally quoted text elements.
965 
966         As per elements(), but with the extension of being quote-aware;
967         the set of delimiters is ignored inside a pair of quotes. Note
968         that an unterminated quote will consume remaining content.
969 
970         ---
971         foreach (quote; quotes ("one two 'three four' five", " "))
972                  ...
973         ---
974 
975 ******************************************************************************/
976 
977 QuoteFruct!T quotes(T) (T[] src, cstring set)
978 {
979         QuoteFruct!T quotes;
980         quotes.set = set;
981         quotes.src = src;
982         return quotes;
983 }
984 
985 /******************************************************************************
986 
987         Convert 'escaped' chars to normal ones: \t => ^t for example.
988         Supports \" \' \\ \a \b \f \n \r \t \v
989 
990 ******************************************************************************/
991 
992 cstring unescape (cstring src, mstring dst = null)
993 {
994         ptrdiff_t delta;
995         auto s = src.ptr;
996         auto len = src.length;
997 
998         // take a peek first to see if there's anything
999         if ((delta = indexOf (s, '\\', len)) < len)
1000            {
1001            // make some room if not enough provided
1002            if (dst.length < src.length)
1003                dst.length = src.length;
1004            auto d = dst.ptr;
1005 
1006            // copy segments over, a chunk at a time
1007            do {
1008               d [0 .. delta] = s [0 .. delta];
1009               len -= delta;
1010               s += delta;
1011               d += delta;
1012 
1013               // bogus trailing '\'
1014               if (len < 2)
1015                  {
1016                  *d++ = '\\';
1017                  len = 0;
1018                  break;
1019                  }
1020 
1021               // translate \char
1022               char c = s[1];
1023               switch (c)
1024                      {
1025                       case '\\':
1026                            break;
1027                       case '\'':
1028                            c = '\'';
1029                            break;
1030                       case '"':
1031                            c = '"';
1032                            break;
1033                       case 'a':
1034                            c = '\a';
1035                            break;
1036                       case 'b':
1037                            c = '\b';
1038                            break;
1039                       case 'f':
1040                            c = '\f';
1041                            break;
1042                       case 'n':
1043                            c = '\n';
1044                            break;
1045                       case 'r':
1046                            c = '\r';
1047                            break;
1048                       case 't':
1049                            c = '\t';
1050                            break;
1051                       case 'v':
1052                            c = '\v';
1053                            break;
1054                       default:
1055                            *d++ = '\\';
1056                      }
1057               *d++ = c;
1058               len -= 2;
1059               s += 2;
1060               } while ((delta = indexOf (s, '\\', len)) < len);
1061 
1062            // copy tail too
1063            d [0 .. len] = s [0 .. len];
1064            return dst [0 .. (d + len) - dst.ptr];
1065            }
1066         return src;
1067 }
1068 
1069 
1070 /******************************************************************************
1071 
1072         jhash() -- hash a variable-length key into a 32-bit value
1073 
1074           k     : the key (the unaligned variable-length array of bytes)
1075           len   : the length of the key, counting by bytes
1076           level : can be any 4-byte value
1077 
1078         Returns a 32-bit value.  Every bit of the key affects every bit of
1079         the return value.  Every 1-bit and 2-bit delta achieves avalanche.
1080 
1081         About 4.3*len + 80 X86 instructions, with excellent pipelining
1082 
1083         The best hash table sizes are powers of 2.  There is no need to do
1084         mod a prime (mod is sooo slow!).  If you need less than 32 bits,
1085         use a bitmask.  For example, if you need only 10 bits, do
1086 
1087                     h = (h & hashmask(10));
1088 
1089         In which case, the hash table should have hashsize(10) elements.
1090         If you are hashing n strings (ub1 **)k, do it like this:
1091 
1092                     for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
1093 
1094         By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use
1095         this code any way you wish, private, educational, or commercial.
1096         It's free.
1097 
1098         See http://burtleburtle.net/bob/hash/evahash.html
1099         Use for hash table lookup, or anything where one collision in 2^32
1100         is acceptable. Do NOT use for cryptographic purposes.
1101 
1102 ******************************************************************************/
1103 
1104 size_t jhash (ubyte* k, size_t len, size_t c = 0)
1105 {
1106         size_t a = 0x9e3779b9,
1107              b = 0x9e3779b9,
1108              i = len;
1109 
1110         // handle most of the key
1111         while (i >= 12)
1112               {
1113               a += *cast(uint *)(k+0);
1114               b += *cast(uint *)(k+4);
1115               c += *cast(uint *)(k+8);
1116 
1117               a -= b; a -= c; a ^= (c>>13);
1118               b -= c; b -= a; b ^= (a<<8);
1119               c -= a; c -= b; c ^= (b>>13);
1120               a -= b; a -= c; a ^= (c>>12);
1121               b -= c; b -= a; b ^= (a<<16);
1122               c -= a; c -= b; c ^= (b>>5);
1123               a -= b; a -= c; a ^= (c>>3);
1124               b -= c; b -= a; b ^= (a<<10);
1125               c -= a; c -= b; c ^= (b>>15);
1126               k += 12; i -= 12;
1127               }
1128 
1129         // handle the last 11 bytes
1130         c += len;
1131         switch (i)
1132                {
1133                case 11: c+=(cast(uint)k[10]<<24); goto case;
1134                case 10: c+=(cast(uint)k[9]<<16); goto case;
1135                case 9 : c+=(cast(uint)k[8]<<8); goto case;
1136                case 8 : b+=(cast(uint)k[7]<<24); goto case;
1137                case 7 : b+=(cast(uint)k[6]<<16); goto case;
1138                case 6 : b+=(cast(uint)k[5]<<8); goto case;
1139                case 5 : b+=(cast(uint)k[4]); goto case;
1140                case 4 : a+=(cast(uint)k[3]<<24); goto case;
1141                case 3 : a+=(cast(uint)k[2]<<16); goto case;
1142                case 2 : a+=(cast(uint)k[1]<<8); goto case;
1143                case 1 : a+=(cast(uint)k[0]); goto default;
1144                default:
1145                }
1146 
1147         a -= b; a -= c; a ^= (c>>13);
1148         b -= c; b -= a; b ^= (a<<8);
1149         c -= a; c -= b; c ^= (b>>13);
1150         a -= b; a -= c; a ^= (c>>12);
1151         b -= c; b -= a; b ^= (a<<16);
1152         c -= a; c -= b; c ^= (b>>5);
1153         a -= b; a -= c; a ^= (c>>3);
1154         b -= c; b -= a; b ^= (a<<10);
1155         c -= a; c -= b; c ^= (b>>15);
1156 
1157         return c;
1158 }
1159 
1160 /// ditto
1161 size_t jhash (void[] x, size_t c = 0)
1162 {
1163         return jhash (cast(ubyte*) x.ptr, x.length, c);
1164 }
1165 
1166 
1167 /******************************************************************************
1168 
1169         Helper fruct for iterator lines(). A fruct is a low
1170         impact mechanism for capturing context relating to an
1171         opApply (conjunction of the names struct and foreach)
1172 
1173 ******************************************************************************/
1174 
1175 private struct LineFruct(T)
1176 {
1177         private T[] src;
1178 
1179         int opApply (scope int delegate (ref T[] line) dg)
1180         {
1181                 int     ret;
1182                 size_t  pos,
1183                         mark;
1184                 T[]     line;
1185 
1186                 enum T nl = '\n';
1187                 enum T cr = '\r';
1188 
1189                 while ((pos = locate (src, nl, mark)) < src.length)
1190                       {
1191                       auto end = pos;
1192                       if (end && src[end-1] is cr)
1193                           --end;
1194 
1195                       line = src [mark .. end];
1196                       if ((ret = dg (line)) != 0)
1197                            return ret;
1198                       mark = pos + 1;
1199                       }
1200 
1201                 line = src [mark .. $];
1202                 if (mark <= src.length)
1203                     ret = dg (line);
1204 
1205                 return ret;
1206         }
1207 }
1208 
1209 /******************************************************************************
1210 
1211         Helper fruct for iterator delims(). A fruct is a low
1212         impact mechanism for capturing context relating to an
1213         opApply (conjunction of the names struct and foreach)
1214 
1215 ******************************************************************************/
1216 
1217 private struct DelimFruct (T)
1218 {
1219         private T[]     src;
1220         private cstring set;
1221 
1222         int opApply (scope int delegate (ref T[] token) dg)
1223         {
1224                 int     ret;
1225                 size_t  pos,
1226                         mark;
1227                 T[]     token;
1228 
1229                 // optimize for single delimiter case
1230                 if (set.length is 1)
1231                     while ((pos = locate (src, set[0], mark)) < src.length)
1232                           {
1233                           token = src [mark .. pos];
1234                           if ((ret = dg (token)) != 0)
1235                                return ret;
1236                           mark = pos + 1;
1237                           }
1238                 else
1239                    if (set.length > 1)
1240                        foreach (i, elem; src)
1241                                 if (contains (set, elem))
1242                                    {
1243                                    token = src [mark .. i];
1244                                    if ((ret = dg (token)) != 0)
1245                                         return ret;
1246                                    mark = i + 1;
1247                                    }
1248 
1249                 token = src [mark .. $];
1250                 if (mark <= src.length)
1251                     ret = dg (token);
1252 
1253                 return ret;
1254         }
1255 }
1256 
1257 /******************************************************************************
1258 
1259         Helper fruct for iterator patterns(). A fruct is a low
1260         impact mechanism for capturing context relating to an
1261         opApply (conjunction of the names struct and foreach)
1262 
1263 ******************************************************************************/
1264 
1265 public struct PatternFruct (T)
1266 {
1267         private T[] src, sub;
1268         private cstring pattern;
1269 
1270         int opApply (scope int delegate (ref T[] token) dg)
1271         {
1272                 int     ret;
1273                 size_t  pos,
1274                         mark;
1275                 T[]     token;
1276 
1277                 while ((pos = index (src, pattern, mark)) < src.length)
1278                       {
1279                       token = src [mark .. pos];
1280                       if ((ret = dg(token)) != 0)
1281                            return ret;
1282                       if (sub.ptr && (ret = dg(sub)) != 0)
1283                           return ret;
1284                       mark = pos + pattern.length;
1285                       }
1286 
1287                 token = src [mark .. $];
1288                 if (mark <= src.length)
1289                     ret = dg (token);
1290 
1291                 return ret;
1292         }
1293 }
1294 
1295 /******************************************************************************
1296 
1297         Helper fruct for iterator quotes(). A fruct is a low
1298         impact mechanism for capturing context relating to an
1299         opApply (conjunction of the names struct and foreach)
1300 
1301 ******************************************************************************/
1302 
1303 private struct QuoteFruct (T)
1304 {
1305         private T[]     src;
1306         private cstring set;
1307 
1308         int opApply (scope int delegate (ref T[] token) dg)
1309         {
1310                 int     ret;
1311                 size_t  mark;
1312                 T[]     token;
1313 
1314                 if (set.length)
1315                     for (size_t i=0; i < src.length; ++i)
1316                         {
1317                         T c = src[i];
1318                         if (c is '"' || c is '\'')
1319                             i = locate (src, c, i+1);
1320                         else
1321                            if (contains (set, c))
1322                               {
1323                               token = src [mark .. i];
1324                               if ((ret = dg (token)) != 0)
1325                                    return ret;
1326                               mark = i + 1;
1327                               }
1328                         }
1329 
1330                 token = src [mark .. $];
1331                 if (mark <= src.length)
1332                     ret = dg (token);
1333 
1334                 return ret;
1335         }
1336 }
1337 
1338 
1339 /******************************************************************************
1340 
1341 ******************************************************************************/
1342 
1343 unittest
1344 {
1345     char[64] tmp;
1346 
1347     test (isSpace (' ') && !isSpace ('d'));
1348 
1349     test (indexOf ("abc".ptr, 'a', 3) is 0);
1350     test (indexOf ("abc".ptr, 'b', 3) is 1);
1351     test (indexOf ("abc".ptr, 'c', 3) is 2);
1352     test (indexOf ("abc".ptr, 'd', 3) is 3);
1353     test (indexOf ("abcabcabc".ptr, 'd', 9) is 9);
1354 
1355     test (mismatch ("abc".ptr, "abc".ptr, 3) is 3);
1356     test (mismatch ("abc".ptr, "abd".ptr, 3) is 2);
1357     test (mismatch ("abc".ptr, "acc".ptr, 3) is 1);
1358     test (mismatch ("abc".ptr, "ccc".ptr, 3) is 0);
1359 
1360     test (matching ("abc".ptr, "abc".ptr, 3));
1361     test (matching ("abc".ptr, "abb".ptr, 3) is false);
1362 
1363     test (contains ("abc", 'a'));
1364     test (contains ("abc", 'b'));
1365     test (contains ("abc", 'c'));
1366     test (contains ("abc", 'd') is false);
1367 
1368     test (containsPattern ("abc", "ab"));
1369     test (containsPattern ("abc", "bc"));
1370     test (containsPattern ("abc", "abc"));
1371     test (containsPattern ("abc", "zabc") is false);
1372     test (containsPattern ("abc", "abcd") is false);
1373     test (containsPattern ("abc", "za") is false);
1374     test (containsPattern ("abc", "cd") is false);
1375 
1376     test (trim ("") == "");
1377     test (trim (" abc  ") == "abc");
1378     test (trim ("   ") == "");
1379 
1380     test (strip ("", '%') == "");
1381     test (strip ("%abc%%%", '%') == "abc");
1382     test (strip ("#####", '#') == "");
1383     test (stripl ("#####", '#') == "");
1384     test (stripl (" ###", ' ') == "###");
1385     test (stripl ("#####", 's') == "#####");
1386     test (stripr ("#####", '#') == "");
1387     test (stripr ("### ", ' ') == "###");
1388     test (stripr ("#####", 's') == "#####");
1389 
1390     test (replace ("abc".dup, 'b', ':') == "a:c");
1391     test (substitute ("abc", "bc", "x") == "ax");
1392 
1393     test (locate ("abc".dup, 'c', 1) is 2);
1394 
1395     test (locate ("abc", 'c') is 2);
1396     test (locate ("abc", 'a') is 0);
1397     test (locate ("abc", 'd') is 3);
1398     test (locate ("", 'c') is 0);
1399 
1400     test (locatePrior ("abce".dup, 'c') is 2);
1401     test (locatePrior ("abce", 'a') is 0);
1402     test (locatePrior ("abce", 'd') is 4);
1403     test (locatePrior ("abce", 'c', 3) is 2);
1404     test (locatePrior ("abce", 'c', 2) is 4);
1405     test (locatePrior ("", 'c') is 0);
1406 
1407     auto x = delimit ("::b", ":");
1408     test (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b");
1409     x = delimit ("a:bc:d", ":");
1410     test (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d");
1411     x = delimit ("abcd", ":");
1412     test (x.length is 1 && x[0] == "abcd");
1413     x = delimit ("abcd:", ":");
1414     test (x.length is 2 && x[0] == "abcd" && x[1] == "");
1415     x = delimit ("a;b$c#d:e@f", ";:$#@");
1416     test (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" &&
1417             x[3]=="d" && x[4]=="e" && x[5]=="f");
1418 
1419     test (locatePattern ("abcdefg".dup, "") is 7);
1420     test (locatePattern ("abcdefg", "g") is 6);
1421     test (locatePattern ("abcdefg", "abcdefg") is 0);
1422     test (locatePattern ("abcdefg", "abcdefgx") is 7);
1423     test (locatePattern ("abcdefg", "cce") is 7);
1424     test (locatePattern ("abcdefg", "cde") is 2);
1425     test (locatePattern ("abcdefgcde", "cde", 3) is 7);
1426 
1427     test (locatePatternPrior ("abcdefg".dup, "") is 7);
1428     test (locatePatternPrior ("abcdefg", "cce") is 7);
1429     test (locatePatternPrior ("abcdefg", "cde") is 2);
1430     test (locatePatternPrior ("abcdefgcde", "cde", 6) is 2);
1431     test (locatePatternPrior ("abcdefgcde", "cde", 4) is 2);
1432     test (locatePatternPrior ("abcdefg", "abcdefgx") is 7);
1433 
1434     x = splitLines ("a\nb\n");
1435     test (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == "");
1436     x = splitLines ("a\r\n");
1437     test (x.length is 2 && x[0] == "a" && x[1] == "");
1438 
1439     x = splitLines ("a");
1440     test (x.length is 1 && x[0] == "a");
1441     x = splitLines ("");
1442     test (x.length is 1);
1443 
1444     cstring[] q;
1445     foreach (element; quotes ("1 'avcc   cc ' 3", " "))
1446         q ~= element;
1447     test (q.length is 3 && q[0] == "1" && q[1] == "'avcc   cc '" && q[2] == "3");
1448 
1449     x = split ("one, two, three", ",");
1450     test (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three");
1451     x = split ("one, two, three", ", ");
1452     test (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three");
1453     x = split ("one, two, three", ",,");
1454     test (x.length is 1 && x[0] == "one, two, three");
1455     x = split ("one,,", ",");
1456     test (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == "");
1457 
1458     istring t, h;
1459     h =  head ("one:two:three", ":", t);
1460     test (h == "one" && t == "two:three");
1461     h = head ("one:::two:three", ":::", t);
1462     test (h == "one" && t == "two:three");
1463     h = head ("one:two:three", "*", t);
1464     test (h == "one:two:three" && t is null);
1465 
1466     t =  tail ("one:two:three", ":", h);
1467     test (h == "one:two" && t == "three");
1468     t = tail ("one:::two:three", ":::", h);
1469     test (h == "one" && t == "two:three");
1470     t = tail ("one:two:three", "*", h);
1471     test (t == "one:two:three" && h is null);
1472 
1473     test (chopl("hello world", "hello ") == "world");
1474     test (chopl("hello", "hello") == "");
1475     test (chopl("hello world", " ") == "hello world");
1476     test (chopl("hello world", "") == "hello world");
1477 
1478     test (chopr("hello world", " world") == "hello");
1479     test (chopr("hello", "hello") == "");
1480     test (chopr("hello world", " ") == "hello world");
1481     test (chopr("hello world", "") == "hello world");
1482 
1483     istring[] foo = ["one", "two", "three"];
1484     auto j = join (foo);
1485     test (j == "onetwothree");
1486     j = join (foo, ", ");
1487     test (j == "one, two, three");
1488     j = join (foo, " ", tmp);
1489     test (j == "one two three");
1490     test (j.ptr is tmp.ptr);
1491 
1492     test (repeat ("abc", 0) == "");
1493     test (repeat ("abc", 1) == "abc");
1494     test (repeat ("abc", 2) == "abcabc");
1495     test (repeat ("abc", 4) == "abcabcabcabc");
1496     test (repeat ("", 4) == "");
1497     char[10] rep;
1498     test (repeat ("abc", 0, rep) == "");
1499     test (repeat ("abc", 1, rep) == "abc");
1500     test (repeat ("abc", 2, rep) == "abcabc");
1501     test (repeat ("", 4, rep) == "");
1502 
1503     test (unescape ("abc") == "abc");
1504     test (unescape ("abc\\") == "abc\\");
1505     test (unescape ("abc\\t") == "abc\t");
1506     test (unescape ("abc\\tc") == "abc\tc");
1507     test (unescape ("\\t") == "\t");
1508     test (unescape ("\\tx") == "\tx");
1509     test (unescape ("\\v\\vx") == "\v\vx");
1510     test (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc");
1511 }
1512 
1513 debug (Util)
1514 {
1515         auto x = import("Util.d");
1516 
1517         void main()
1518         {
1519                 mismatch ("".ptr, x.ptr, 0);
1520                 indexOf ("".ptr, '@', 0);
1521                 char[] s;
1522                 split (s, " ");
1523                 //indexOf (s.ptr, '@', 0);
1524 
1525         }
1526 }