1 /*******************************************************************************
2 
3         Placeholder for a variety of wee functions.
4 
5         Several of these functions return an index value, representing where
6         some criteria was identified. When said criteria is not matched, the
7         functions return a value representing the array length provided to
8         them. That is, for those scenarios where C functions might typically
9         return -1 these functions return length instead. This operate nicely
10         with D slices:
11         ---
12         auto text = "happy:faces";
13 
14         assert (text[0 .. locate (text, ':')] == "happy");
15 
16         assert (text[0 .. locate (text, '!')] == "happy:faces");
17         ---
18 
19         The contains() function is more convenient for trivial
20         lookup cases:
21         ---
22         if (contains ("fubar", '!'))
23             ...
24         ---
25 
26         Note that where some functions expect a size_t as an argument, the
27         D template-matching algorithm will fail where an int is provided
28         instead. This is the typically the cause of "template not found"
29         errors. Also note that name overloading is not supported cleanly
30         by IFTI at this time, so is not applied here.
31 
32 
33         Applying the D "import alias" mechanism to this module is highly
34         recommended, in order to limit namespace pollution:
35         ---
36         import Util = ocean.text.Util;
37 
38         auto s = Util.trim ("  foo ");
39         ---
40 
41 
42         Function templates:
43         ---
44         trim (source)                               // trim whitespace
45         triml (source)                              // trim whitespace
46         trimr (source)                              // trim whitespace
47         strip (source, match)                       // trim elements
48         stripl (source, match)                      // trim elements
49         stripr (source, match)                      // trim elements
50         chopl (source, match)                       // trim pattern match
51         chopr (source, match)                       // trim pattern match
52         delimit (src, set)                          // split on delims
53         split (source, pattern)                     // split on pattern
54         splitLines (source);                        // split on lines
55         head (source, pattern, tail)                // split to head & tail
56         join (source, postfix, output)              // join text segments
57         prefix (dst, prefix, content...)            // prefix text segments
58         postfix (dst, postfix, content...)          // postfix text segments
59         combine (dst, prefix, postfix, content...)  // combine lotsa stuff
60         repeat (source, count, output)              // repeat source
61         replace (source, match, replacement)        // replace chars
62         substitute (source, match, replacement)     // replace/remove matches
63         count (source, match)                       // count instances
64         contains (source, match)                    // has char?
65         containsPattern (source, match)             // has pattern?
66         index (source, match, start)                // find match index
67         locate (source, match, start)               // find char
68         locatePrior (source, match, start)          // find prior char
69         locatePattern (source, match, start);       // find pattern
70         locatePatternPrior (source, match, start);  // find prior pattern
71         indexOf (s*, match, length)                 // low-level lookup
72         mismatch (s1*, s2*, length)                 // low-level compare
73         matching (s1*, s2*, length)                 // low-level compare
74         isSpace (match)                             // is whitespace?
75         unescape(source, output)                    // convert '\' prefixes
76         layout (destination, format ...)            // featherweight printf
77         lines (str)                                 // foreach lines
78         quotes (str, set)                           // foreach quotes
79         delimiters (str, set)                       // foreach delimiters
80         patterns (str, pattern)                     // foreach patterns
81         ---
82 
83         Please note that any 'pattern' referred to within this module
84         refers to a pattern of characters, and not some kind of regex
85         descriptor. Use the Regex module for regex operation.
86 
87         Copyright:
88             Copyright (c) 2004 Kris Bell.
89             Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
90             All rights reserved.
91 
92         License:
93             Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
94             See LICENSE_TANGO.txt for details.
95 
96         Version:
97             Apr 2004: Initial release
98             Dec 2006: South Seas version
99 
100         Authors: Kris
101 
102 *******************************************************************************/
103 
104 module ocean.text.Util;
105 
106 //import ocean.meta.types.Qualifiers : cstring, mstring;
107 import ocean.transition;
108 import ocean.core.Verify;
109 
110 version (UnitTest) import ocean.core.Test;
111 
112 
113 /******************************************************************************
114 
115         Trim the provided array by stripping whitespace from both
116         ends. Returns a slice of the original content
117 
118 ******************************************************************************/
119 
120 inout(char)[] trim (inout(char)[] source)
121 {
122         inout(char)*   head = source.ptr,
123                        tail = head + source.length;
124 
125         while (head < tail && isSpace(*head))
126                ++head;
127 
128         while (tail > head && isSpace(*(tail-1)))
129                --tail;
130 
131         return head [0 .. tail - head];
132 }
133 
134 /******************************************************************************
135 
136         Trim the provided array by stripping whitespace from the left.
137         Returns a slice of the original content
138 
139 ******************************************************************************/
140 
141 inout(char)[] triml (inout(char)[] source)
142 {
143         inout(char)*   head = source.ptr,
144                        tail = head + source.length;
145 
146         while (head < tail && isSpace(*head))
147                ++head;
148 
149         return head [0 .. tail - head];
150 }
151 
152 /******************************************************************************
153 
154         Trim the provided array by stripping whitespace from the right.
155         Returns a slice of the original content
156 
157 ******************************************************************************/
158 
159 inout(char)[] trimr (inout(char)[] source)
160 {
161         inout(char)*   head = source.ptr,
162                        tail = head + source.length;
163 
164         while (tail > head && isSpace(*(tail-1)))
165                --tail;
166 
167         return head [0 .. tail - head];
168 }
169 
170 /******************************************************************************
171 
172         Trim the given array by stripping the provided match from
173         both ends. Returns a slice of the original content
174 
175 ******************************************************************************/
176 
177 inout(char)[] strip (inout(char)[] source, char match)
178 {
179         inout(char)*   head = source.ptr,
180                        tail = head + source.length;
181 
182         while (head < tail && *head is match)
183                ++head;
184 
185         while (tail > head && *(tail-1) is match)
186                --tail;
187 
188         return head [0 .. tail - head];
189 }
190 
191 /******************************************************************************
192 
193         Trim the given array by stripping the provided match from
194         the left hand side. Returns a slice of the original content
195 
196 ******************************************************************************/
197 
198 inout(char)[] stripl (inout(char)[] source, char match)
199 {
200         inout(char)* head = source.ptr,
201                      tail = head + source.length;
202 
203         while (head < tail && *head is match)
204                ++head;
205 
206         return head [0 .. tail - head];
207 }
208 
209 /******************************************************************************
210 
211         Trim the given array by stripping the provided match from
212         the right hand side. Returns a slice of the original content
213 
214 ******************************************************************************/
215 
216 inout(char)[] stripr (inout(char)[] source, char match)
217 {
218         inout(char)* head = source.ptr,
219                      tail = head + source.length;
220 
221         while (tail > head && *(tail-1) is match)
222                --tail;
223 
224         return head [0 .. tail - head];
225 }
226 
227 /******************************************************************************
228 
229         Chop the given source by stripping the provided match from
230         the left hand side. Returns a slice of the original content
231 
232 ******************************************************************************/
233 
234 inout(char)[] chopl (inout(char)[] source, cstring match)
235 {
236         if (match.length <= source.length)
237             if (source[0 .. match.length] == match)
238                 source = source [match.length .. $];
239 
240         return source;
241 }
242 
243 /******************************************************************************
244 
245         Chop the given source by stripping the provided match from
246         the right hand side. Returns a slice of the original content
247 
248 ******************************************************************************/
249 
250 inout(char)[] chopr (inout(char)[] source, cstring match)
251 {
252         if (match.length <= source.length)
253             if (source[$-match.length .. $] == match)
254                 source = source [0 .. $-match.length];
255 
256         return source;
257 }
258 
259 /******************************************************************************
260 
261         Replace all instances of one element with another (in place)
262 
263 ******************************************************************************/
264 
265 mstring replace (mstring source, char match, char replacement)
266 {
267         foreach (ref c; source)
268                  if (c is match)
269                      c = replacement;
270         return source;
271 }
272 
273 /******************************************************************************
274 
275         Substitute all instances of match from source. Set replacement
276         to null in order to remove instead of replace
277 
278 ******************************************************************************/
279 
280 mstring substitute (cstring source, cstring match, cstring replacement)
281 {
282         mstring output;
283 
284         foreach (s; patterns (source, match, replacement))
285         {
286                     output ~= s;
287         }
288         return output;
289 }
290 
291 /******************************************************************************
292 
293         Count all instances of match within source
294 
295 ******************************************************************************/
296 
297 size_t count (cstring source, cstring match)
298 {
299         size_t c;
300 
301         foreach (s; patterns (source, match))
302                     ++c;
303         verify(c > 0);
304         return c - 1;
305 }
306 
307 /******************************************************************************
308 
309         Returns whether or not the provided array contains an instance
310         of the given match
311 
312 ******************************************************************************/
313 
314 bool contains (cstring source, char match)
315 {
316         return indexOf (source.ptr, match, source.length) != source.length;
317 }
318 
319 /******************************************************************************
320 
321         Returns whether or not the provided array contains an instance
322         of the given match
323 
324 ******************************************************************************/
325 
326 bool containsPattern (cstring source, cstring match)
327 {
328         return locatePattern (source, match) != source.length;
329 }
330 
331 unittest
332 {
333     mstring greeting = "Hello world".dup;
334     istring satan = "Hell";
335     test(containsPattern(greeting, satan), "Pattern not found");
336 }
337 
338 
339 /******************************************************************************
340 
341         Return the index of the next instance of 'match' starting at
342         position 'start', or source.length where there is no match.
343 
344         Parameter 'start' defaults to 0
345 
346 ******************************************************************************/
347 
348 size_t index (cstring source, cstring match, size_t start=0)
349 {
350         return (match.length is 1) ? locate (source, match[0], start)
351                                    : locatePattern (source, match, start);
352 }
353 
354 unittest
355 {
356     char[] url;
357     size_t start;
358     cstring CSLASHSLASH = "://";
359     start = index(url, CSLASHSLASH);
360 }
361 
362 /******************************************************************************
363 
364         Return the index of the prior instance of 'match' starting
365         just before 'start', or source.length where there is no match.
366 
367         Parameter 'start' defaults to source.length
368 
369 ******************************************************************************/
370 
371 size_t rindex (cstring source, cstring match, size_t start=size_t.max)
372 {
373         return (match.length is 1) ? locatePrior (source, match[0], start)
374                                    : locatePatternPrior (source, match, start);
375 }
376 
377 /******************************************************************************
378 
379         Return the index of the next instance of 'match' starting at
380         position 'start', or source.length where there is no match.
381 
382         Parameter 'start' defaults to 0
383 
384 ******************************************************************************/
385 
386 size_t locate (cstring source, char match, size_t start=0)
387 {
388         if (start > source.length)
389             start = source.length;
390 
391         return indexOf (source.ptr+start, match, source.length - start) + start;
392 }
393 
394 unittest
395 {
396     size_t start;
397     char[] url;
398     start = locate(url, '/', start);
399 }
400 
401 /******************************************************************************
402 
403         Return the index of the prior instance of 'match' starting
404         just before 'start', or source.length where there is no match.
405 
406         Parameter 'start' defaults to source.length
407 
408 ******************************************************************************/
409 
410 size_t locatePrior (cstring source, char match, size_t start = size_t.max)
411 {
412         if (start > source.length)
413             start = source.length;
414 
415         while (start > 0)
416                if (source[--start] is match)
417                    return start;
418         return source.length;
419 }
420 
421 /******************************************************************************
422 
423         Return the index of the next instance of 'match' starting at
424         position 'start', or source.length where there is no match.
425 
426         Parameter 'start' defaults to 0
427 
428 ******************************************************************************/
429 
430 size_t locatePattern (cstring source, cstring match, size_t start=0)
431 {
432         size_t idx;
433         const(char)* p = source.ptr + start;
434         size_t extent = source.length - start - match.length + 1;
435 
436         if (match.length && extent <= source.length)
437         {
438             while (extent)
439                 if ((idx = indexOf (p, match[0], extent)) is extent)
440                     break;
441                 else
442                 {
443                     if (matching (p+=idx, match.ptr, match.length))
444                         return p - source.ptr;
445                     else
446                     {
447                         extent -= (idx+1);
448                         ++p;
449                     }
450                 }
451         }
452 
453         return source.length;
454 }
455 
456 /******************************************************************************
457 
458         Return the index of the prior instance of 'match' starting
459         just before 'start', or source.length where there is no match.
460 
461         Parameter 'start' defaults to source.length
462 
463 ******************************************************************************/
464 
465 size_t locatePatternPrior (cstring source, cstring match, size_t start=size_t.max)
466 {
467         auto len = source.length;
468 
469         if (start > len)
470             start = len;
471 
472         if (match.length && match.length <= len)
473             while (start)
474                   {
475                   start = locatePrior (source, match[0], start);
476                   if (start is len)
477                       break;
478                   else
479                      if ((start + match.length) <= len)
480                           if (matching (source.ptr+start, match.ptr, match.length))
481                               return start;
482                   }
483 
484         return len;
485 }
486 
487 /******************************************************************************
488 
489         Split the provided array on the first pattern instance, and
490         return the resultant head and tail. The pattern is excluded
491         from the two segments.
492 
493         Where a segment is not found, tail will be null and the return
494         value will be the original array.
495 
496 ******************************************************************************/
497 
498 inout(char)[] head (inout(char)[] src, cstring pattern, out inout(char)[] tail)
499 {
500         auto i = locatePattern (src, pattern);
501         if (i != src.length)
502            {
503            tail = src [i + pattern.length .. $];
504            src = src [0 .. i];
505            }
506         return src;
507 }
508 
509 /******************************************************************************
510 
511         Split the provided array on the last pattern instance, and
512         return the resultant head and tail. The pattern is excluded
513         from the two segments.
514 
515         Where a segment is not found, head will be null and the return
516         value will be the original array.
517 
518 ******************************************************************************/
519 
520 inout(char)[] tail (inout(char)[] src, cstring pattern, out inout(char)[] head)
521 {
522         auto i = locatePatternPrior (src, pattern);
523         if (i != src.length)
524            {
525            head = src [0 .. i];
526            src = src [i + pattern.length .. $];
527            }
528         return src;
529 }
530 
531 /******************************************************************************
532 
533         Split the provided array wherever a delimiter-set instance is
534         found, and return the resultant segments. The delimiters are
535         excluded from each of the segments. Note that delimiters are
536         matched as a set of alternates rather than as a pattern.
537 
538         Splitting on a single delimiter is considerably faster than
539         splitting upon a set of alternatives.
540 
541         Note that the src content is not duplicated by this function,
542         but is sliced instead.
543 
544 ******************************************************************************/
545 
546 inout(char)[][] delimit (inout(char)[] src, cstring set)
547 {
548         typeof(return) result;
549 
550         // Cast is needed to avoid instantiating `DelimFruct!(inout(char))`
551         // which can't compile. We cast segment back to `inout` after so the end
552         // result is still type-checked.
553 
554         foreach (segment; delimiters (cast(cstring) src, set))
555                  result ~= cast(typeof(src)) segment;
556         return result;
557 }
558 
559 /******************************************************************************
560 
561         Split the provided array wherever a pattern instance is
562         found, and return the resultant segments. The pattern is
563         excluded from each of the segments.
564 
565         Note that the src content is not duplicated by this function,
566         but is sliced instead.
567 
568 ******************************************************************************/
569 
570 inout(char)[][] split (inout(char)[] src, cstring pattern)
571 {
572         typeof(return) result;
573 
574         // Cast is needed to avoid instantiating `PatternFruct!(inout(char))`
575         // which can't compile. We cast segment back to `inout` after so the end
576         // result is still type-checked.
577 
578         foreach (segment; patterns (cast(cstring) src, pattern))
579                  result ~= cast(typeof(src)) segment;
580         return result;
581 }
582 
583 /******************************************************************************
584 
585         Convert text into a set of lines, where each line is identified
586         by a \n or \r\n combination. The line terminator is stripped from
587         each resultant array
588 
589         Note that the src content is not duplicated by this function, but
590         is sliced instead.
591 
592 ******************************************************************************/
593 
594 inout(char)[][] toLines (inout(char)[] src)
595 {
596         typeof(return) result;
597 
598         // Cast is needed to avoid instantiating `LineFruct!(inout(char))`
599         // which can't compile. We cast line back to `inout` after so the end
600         // result is still type-checked.
601 
602         foreach (line; lines (cast(cstring) src))
603                  result ~= cast(typeof(src)) line;
604         return result;
605 }
606 
607 alias toLines splitLines;
608 
609 /******************************************************************************
610 
611         Return the indexed line, where each line is identified by a \n
612         or \r\n combination. The line terminator is stripped from the
613         resultant line
614 
615         Note that src content is not duplicated by this function, but
616         is sliced instead.
617 
618 ******************************************************************************/
619 
620 inout(char)[] lineOf (inout(char)[] src, size_t index)
621 {
622         int i = 0;
623 
624         // Cast is needed to avoid instantiating `LineFruct!(inout(char))`
625         // which can't compile. We cast line back to `inout` after so the end
626         // result is still type-checked.
627 
628         foreach (line; lines(cast(cstring) src))
629                  if (i++ is index)
630                      return cast(typeof(return)) line;
631         return null;
632 }
633 
634 /******************************************************************************
635 
636         Combine a series of text segments together, each appended with
637         a postfix pattern. An optional output buffer can be provided to
638         avoid heap activity - it should be large enough to contain the
639         entire output, otherwise the heap will be used instead.
640 
641         Returns a valid slice of the output, containing the concatenated
642         text.
643 
644 ******************************************************************************/
645 
646 mstring join (const(char[])[] src, cstring postfix=null, mstring dst = null)
647 {
648         return combine(dst, null, postfix, src);
649 }
650 
651 unittest
652 {
653     test (join([ "aaa", "bbb", "ccc" ], ",") == "aaa,bbb,ccc");
654 
655     // ensure `join` works with differently qualified arguments
656     const(char[][]) mut = [ "xxx".dup, "yyy".dup, "zzz" ];
657     char[20] buf;
658     auto ret = join(mut, " ", buf);
659     test (ret == "xxx yyy zzz");
660     test (ret.ptr is buf.ptr);
661 }
662 
663 /******************************************************************************
664 
665         Combine a series of text segments together, each prepended with
666         a prefix pattern. An optional output buffer can be provided to
667         avoid heap activity - it should be large enough to contain the
668         entire output, otherwise the heap will be used instead.
669 
670         Note that, unlike join(), the output buffer is specified first
671         such that a set of trailing strings can be provided.
672 
673         Returns a valid slice of the output, containing the concatenated
674         text.
675 
676 ******************************************************************************/
677 
678 mstring prefix (mstring dst, cstring prefix, const(char[])[] src...)
679 {
680         return combine(dst, prefix, null, src);
681 }
682 
683 /******************************************************************************
684 
685         Combine a series of text segments together, each appended with an
686         optional postfix pattern. An optional output buffer can be provided
687         to avoid heap activity - it should be large enough to contain the
688         entire output, otherwise the heap will be used instead.
689 
690         Note that, unlike join(), the output buffer is specified first
691         such that a set of trailing strings can be provided.
692 
693         Returns a valid slice of the output, containing the concatenated
694         text.
695 
696 ******************************************************************************/
697 
698 mstring postfix (mstring dst, cstring postfix, cstring[] src...)
699 {
700         return combine(dst, null, postfix, src);
701 }
702 
703 /******************************************************************************
704 
705         Combine a series of text segments together, each prefixed and/or
706         postfixed with optional strings. An optional output buffer can be
707         provided to avoid heap activity - which should be large enough to
708         contain the entire output, otherwise the heap will be used instead.
709 
710         Note that, unlike join(), the output buffer is specified first
711         such that a set of trailing strings can be provided.
712 
713         Returns a valid slice of the output, containing the concatenated
714         text.
715 
716 ******************************************************************************/
717 
718 mstring combine (mstring dst, cstring prefix, cstring postfix, const(char[])[] src ...)
719 {
720         size_t len = src.length * prefix.length +
721                    src.length * postfix.length;
722 
723         foreach (segment; src)
724                  len += segment.length;
725 
726         if (dst.length < len)
727             dst.length = len;
728 
729         auto p = dst.ptr;
730         foreach (segment; src)
731                 {
732                 p[0 .. prefix.length] = prefix;
733                 p += prefix.length;
734                 p[0 .. segment.length] = segment;
735                 p += segment.length;
736                 p[0 .. postfix.length] = postfix;
737                 p += postfix.length;
738                 }
739 
740         // remove trailing seperator
741         if (len)
742             len -= postfix.length;
743         return dst [0 .. len];
744 }
745 
746 /******************************************************************************
747 
748         Repeat an array for a specific number of times. An optional output
749         buffer can be provided to avoid heap activity - it should be large
750         enough to contain the entire output, otherwise the heap will be used
751         instead.
752 
753         Returns a valid slice of the output, containing the concatenated
754         text.
755 
756 ******************************************************************************/
757 
758 mstring repeat (cstring src, size_t count, mstring dst=null)
759 {
760         size_t len = src.length * count;
761         if (len is 0)
762             return null;
763 
764         if (dst.length < len)
765             dst.length = len;
766 
767         for (auto p = dst.ptr; count--; p += src.length)
768              p[0 .. src.length] = src;
769 
770         return dst [0 .. len];
771 }
772 
773 /******************************************************************************
774 
775         Is the argument a whitespace character?
776 
777 ******************************************************************************/
778 
779 bool isSpace (char c)
780 {
781         return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v'));
782 }
783 
784 /******************************************************************************
785 
786         Return whether or not the two arrays have matching content
787 
788 ******************************************************************************/
789 
790 bool matching (const(char)* s1, const(char)* s2, size_t length)
791 {
792         return mismatch(s1, s2, length) is length;
793 }
794 
795 /******************************************************************************
796 
797         Returns the index of the first match in str, failing once
798         length is reached. Note that we return 'length' for failure
799         and a 0-based index on success
800 
801 ******************************************************************************/
802 
803 size_t indexOf (const(char)* str, char match, size_t length)
804 {
805         enum m1 = cast(size_t) 0x0101010101010101;
806         enum m2 = cast(size_t) 0x8080808080808080;
807 
808         if (length)
809         {
810            size_t m = match;
811            m += m << 8;
812            m += (m << (8 * 2));
813            m += (m << (8 * 4));
814 
815            auto p = str;
816            auto e = p + length - size_t.sizeof;
817            while (p < e)
818            {
819                  // clear matching T segments
820                  auto v = (*cast(size_t*) p) ^ m;
821                  // test for zero, courtesy of Alan Mycroft
822                  if ((v - m1) & ~v & m2)
823                       break;
824                  p += size_t.sizeof;
825            }
826 
827            e += size_t.sizeof;
828            while (p < e)
829                   if (*p++ is match)
830                       return cast(size_t) (p - str - 1);
831         }
832         return length;
833 }
834 
835 /******************************************************************************
836 
837         Returns the index of a mismatch between s1 & s2, failing when
838         length is reached. Note that we return 'length' upon failure
839         (array content matches) and a 0-based index upon success.
840 
841         Use this as a faster opEquals. Also provides the basis for a
842         faster opCmp, since the index of the first mismatched character
843         can be used to determine the return value
844 
845 ******************************************************************************/
846 
847 size_t mismatch (const(char)* s1, const(char)* s2, size_t length)
848 {
849         verify(s1 && s2);
850 
851         if (length)
852         {
853            auto start = s1;
854            auto e = start + length - size_t.sizeof;
855 
856            while (s1 < e)
857                  {
858                  if (*cast(size_t*) s1 != *cast(size_t*) s2)
859                      break;
860                  s1 += size_t.sizeof;
861                  s2 += size_t.sizeof;
862                  }
863 
864            e += size_t.sizeof;
865            while (s1 < e)
866                   if (*s1++ != *s2++)
867                       return s1 - start - 1;
868         }
869 
870         return length;
871 }
872 
873 /******************************************************************************
874 
875         Iterator to isolate lines.
876 
877         Converts text into a set of lines, where each line is identified
878         by a \n or \r\n combination. The line terminator is stripped from
879         each resultant array.
880 
881         ---
882         foreach (line; lines ("one\ntwo\nthree"))
883                  ...
884         ---
885 
886 ******************************************************************************/
887 
888 LineFruct!(T) lines(T) (T[] src)
889 {
890         LineFruct!(T) lines;
891         lines.src = src;
892         return lines;
893 }
894 
895 /******************************************************************************
896 
897         Iterator to isolate text elements.
898 
899         Splits the provided array wherever a delimiter-set instance is
900         found, and return the resultant segments. The delimiters are
901         excluded from each of the segments. Note that delimiters are
902         matched as a set of alternates rather than as a pattern.
903 
904         Splitting on a single delimiter is considerably faster than
905         splitting upon a set of alternatives.
906 
907         ---
908         foreach (segment; delimiters ("one,two;three", ",;"))
909                  ...
910         ---
911 
912         Has to be templated to propagate mutable/const input qualifier to
913         return struct.
914 
915 ******************************************************************************/
916 
917 DelimFruct!T delimiters (T) (T[] src, cstring set)
918 {
919         DelimFruct!T elements;
920         elements.set = set;
921         elements.src = src;
922         return elements;
923 }
924 
925 /******************************************************************************
926 
927         Iterator to isolate text elements.
928 
929         Split the provided array wherever a pattern instance is found,
930         and return the resultant segments. Pattern are excluded from
931         each of the segments, and an optional sub argument enables
932         replacement.
933 
934         ---
935         foreach (segment; patterns ("one, two, three", ", "))
936                  ...
937         ---
938 
939 ******************************************************************************/
940 
941 PatternFruct!T patterns(T) (T[] src, cstring pattern, T[] sub = null)
942 {
943         PatternFruct!T elements;
944         elements.pattern = pattern;
945         elements.sub = sub;
946         elements.src = src;
947         return elements;
948 }
949 
950 unittest
951 {
952     cstring[] arr;
953     foreach (match; patterns("aaa..bbb..ccc", ".."))
954         arr ~= match;
955     test (arr == [ "aaa", "bbb", "ccc" ]);
956 
957     arr = [ ];
958     foreach (match; patterns("aaa..bbb..ccc", "..", "X"))
959         arr ~= match;
960     test (arr == [ "aaa", "X", "bbb", "X", "ccc" ]);
961 }
962 
963 /******************************************************************************
964 
965         Iterator to isolate optionally quoted text elements.
966 
967         As per elements(), but with the extension of being quote-aware;
968         the set of delimiters is ignored inside a pair of quotes. Note
969         that an unterminated quote will consume remaining content.
970 
971         ---
972         foreach (quote; quotes ("one two 'three four' five", " "))
973                  ...
974         ---
975 
976 ******************************************************************************/
977 
978 QuoteFruct!T quotes(T) (T[] src, cstring set)
979 {
980         QuoteFruct!T quotes;
981         quotes.set = set;
982         quotes.src = src;
983         return quotes;
984 }
985 
986 /*******************************************************************************
987 
988         Arranges text strings in order, using indices to specify where
989         each particular argument should be positioned within the text.
990         This is handy for collating I18N components, or as a simplistic
991         and lightweight formatter. Indices range from zero through nine.
992 
993         ---
994         // write ordered text to the console
995         char[64] tmp;
996 
997         Cout (layout (tmp, "%1 is after %0", "zero", "one")).newline;
998         ---
999 
1000 *******************************************************************************/
1001 
1002 cstring layout(mstring output, cstring format, const(char[])[] arguments ...)
1003 {
1004     return layout_(output, format, arguments);
1005 }
1006 
1007 private cstring layout_ (mstring output, cstring format, const(char[])[] arguments)
1008 {
1009         static badarg   = "{index out of range}";
1010         static toosmall = "{output buffer too small}";
1011 
1012         size_t  pos,
1013                 args;
1014         bool    state;
1015 
1016         args = arguments.length;
1017         foreach (c; format)
1018                 {
1019                 if (state)
1020                    {
1021                    state = false;
1022                    if (c >= '0' && c <= '9')
1023                       {
1024                       size_t index = c - '0';
1025                       if (index < args)
1026                          {
1027                          auto x = arguments[index];
1028 
1029                          size_t limit = pos + x.length;
1030                          if (limit < output.length)
1031                             {
1032                             output [pos .. limit] = x;
1033                             pos = limit;
1034                             continue;
1035                             }
1036                          else
1037                             return toosmall;
1038                          }
1039                       else
1040                          return badarg;
1041                       }
1042                    }
1043                 else
1044                    if (c is '%')
1045                       {
1046                       state = true;
1047                       continue;
1048                       }
1049 
1050                 if (pos < output.length)
1051                    {
1052                    output[pos] = c;
1053                    ++pos;
1054                    }
1055                 else
1056                    return toosmall;
1057                 }
1058 
1059         return output [0..pos];
1060 }
1061 
1062 /******************************************************************************
1063 
1064         Convert 'escaped' chars to normal ones: \t => ^t for example.
1065         Supports \" \' \\ \a \b \f \n \r \t \v
1066 
1067 ******************************************************************************/
1068 
1069 cstring unescape (cstring src, mstring dst = null)
1070 {
1071         ptrdiff_t delta;
1072         auto s = src.ptr;
1073         auto len = src.length;
1074 
1075         // take a peek first to see if there's anything
1076         if ((delta = indexOf (s, '\\', len)) < len)
1077            {
1078            // make some room if not enough provided
1079            if (dst.length < src.length)
1080                dst.length = src.length;
1081            auto d = dst.ptr;
1082 
1083            // copy segments over, a chunk at a time
1084            do {
1085               d [0 .. delta] = s [0 .. delta];
1086               len -= delta;
1087               s += delta;
1088               d += delta;
1089 
1090               // bogus trailing '\'
1091               if (len < 2)
1092                  {
1093                  *d++ = '\\';
1094                  len = 0;
1095                  break;
1096                  }
1097 
1098               // translate \char
1099               char c = s[1];
1100               switch (c)
1101                      {
1102                       case '\\':
1103                            break;
1104                       case '\'':
1105                            c = '\'';
1106                            break;
1107                       case '"':
1108                            c = '"';
1109                            break;
1110                       case 'a':
1111                            c = '\a';
1112                            break;
1113                       case 'b':
1114                            c = '\b';
1115                            break;
1116                       case 'f':
1117                            c = '\f';
1118                            break;
1119                       case 'n':
1120                            c = '\n';
1121                            break;
1122                       case 'r':
1123                            c = '\r';
1124                            break;
1125                       case 't':
1126                            c = '\t';
1127                            break;
1128                       case 'v':
1129                            c = '\v';
1130                            break;
1131                       default:
1132                            *d++ = '\\';
1133                      }
1134               *d++ = c;
1135               len -= 2;
1136               s += 2;
1137               } while ((delta = indexOf (s, '\\', len)) < len);
1138 
1139            // copy tail too
1140            d [0 .. len] = s [0 .. len];
1141            return dst [0 .. (d + len) - dst.ptr];
1142            }
1143         return src;
1144 }
1145 
1146 
1147 /******************************************************************************
1148 
1149         jhash() -- hash a variable-length key into a 32-bit value
1150 
1151           k     : the key (the unaligned variable-length array of bytes)
1152           len   : the length of the key, counting by bytes
1153           level : can be any 4-byte value
1154 
1155         Returns a 32-bit value.  Every bit of the key affects every bit of
1156         the return value.  Every 1-bit and 2-bit delta achieves avalanche.
1157 
1158         About 4.3*len + 80 X86 instructions, with excellent pipelining
1159 
1160         The best hash table sizes are powers of 2.  There is no need to do
1161         mod a prime (mod is sooo slow!).  If you need less than 32 bits,
1162         use a bitmask.  For example, if you need only 10 bits, do
1163 
1164                     h = (h & hashmask(10));
1165 
1166         In which case, the hash table should have hashsize(10) elements.
1167         If you are hashing n strings (ub1 **)k, do it like this:
1168 
1169                     for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
1170 
1171         By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use
1172         this code any way you wish, private, educational, or commercial.
1173         It's free.
1174 
1175         See http://burtleburtle.net/bob/hash/evahash.html
1176         Use for hash table lookup, or anything where one collision in 2^32
1177         is acceptable. Do NOT use for cryptographic purposes.
1178 
1179 ******************************************************************************/
1180 
1181 size_t jhash (ubyte* k, size_t len, size_t c = 0)
1182 {
1183         size_t a = 0x9e3779b9,
1184              b = 0x9e3779b9,
1185              i = len;
1186 
1187         // handle most of the key
1188         while (i >= 12)
1189               {
1190               a += *cast(uint *)(k+0);
1191               b += *cast(uint *)(k+4);
1192               c += *cast(uint *)(k+8);
1193 
1194               a -= b; a -= c; a ^= (c>>13);
1195               b -= c; b -= a; b ^= (a<<8);
1196               c -= a; c -= b; c ^= (b>>13);
1197               a -= b; a -= c; a ^= (c>>12);
1198               b -= c; b -= a; b ^= (a<<16);
1199               c -= a; c -= b; c ^= (b>>5);
1200               a -= b; a -= c; a ^= (c>>3);
1201               b -= c; b -= a; b ^= (a<<10);
1202               c -= a; c -= b; c ^= (b>>15);
1203               k += 12; i -= 12;
1204               }
1205 
1206         // handle the last 11 bytes
1207         c += len;
1208         switch (i)
1209                {
1210                case 11: c+=(cast(uint)k[10]<<24); goto case;
1211                case 10: c+=(cast(uint)k[9]<<16); goto case;
1212                case 9 : c+=(cast(uint)k[8]<<8); goto case;
1213                case 8 : b+=(cast(uint)k[7]<<24); goto case;
1214                case 7 : b+=(cast(uint)k[6]<<16); goto case;
1215                case 6 : b+=(cast(uint)k[5]<<8); goto case;
1216                case 5 : b+=(cast(uint)k[4]); goto case;
1217                case 4 : a+=(cast(uint)k[3]<<24); goto case;
1218                case 3 : a+=(cast(uint)k[2]<<16); goto case;
1219                case 2 : a+=(cast(uint)k[1]<<8); goto case;
1220                case 1 : a+=(cast(uint)k[0]); goto default;
1221                default:
1222                }
1223 
1224         a -= b; a -= c; a ^= (c>>13);
1225         b -= c; b -= a; b ^= (a<<8);
1226         c -= a; c -= b; c ^= (b>>13);
1227         a -= b; a -= c; a ^= (c>>12);
1228         b -= c; b -= a; b ^= (a<<16);
1229         c -= a; c -= b; c ^= (b>>5);
1230         a -= b; a -= c; a ^= (c>>3);
1231         b -= c; b -= a; b ^= (a<<10);
1232         c -= a; c -= b; c ^= (b>>15);
1233 
1234         return c;
1235 }
1236 
1237 /// ditto
1238 size_t jhash (void[] x, size_t c = 0)
1239 {
1240         return jhash (cast(ubyte*) x.ptr, x.length, c);
1241 }
1242 
1243 
1244 /******************************************************************************
1245 
1246         Helper fruct for iterator lines(). A fruct is a low
1247         impact mechanism for capturing context relating to an
1248         opApply (conjunction of the names struct and foreach)
1249 
1250 ******************************************************************************/
1251 
1252 private struct LineFruct(T)
1253 {
1254         private T[] src;
1255 
1256         int opApply (scope int delegate (ref T[] line) dg)
1257         {
1258                 int     ret;
1259                 size_t  pos,
1260                         mark;
1261                 T[]     line;
1262 
1263                 enum T nl = '\n';
1264                 enum T cr = '\r';
1265 
1266                 while ((pos = locate (src, nl, mark)) < src.length)
1267                       {
1268                       auto end = pos;
1269                       if (end && src[end-1] is cr)
1270                           --end;
1271 
1272                       line = src [mark .. end];
1273                       if ((ret = dg (line)) != 0)
1274                            return ret;
1275                       mark = pos + 1;
1276                       }
1277 
1278                 line = src [mark .. $];
1279                 if (mark <= src.length)
1280                     ret = dg (line);
1281 
1282                 return ret;
1283         }
1284 }
1285 
1286 /******************************************************************************
1287 
1288         Helper fruct for iterator delims(). A fruct is a low
1289         impact mechanism for capturing context relating to an
1290         opApply (conjunction of the names struct and foreach)
1291 
1292 ******************************************************************************/
1293 
1294 private struct DelimFruct (T)
1295 {
1296         private T[]     src;
1297         private cstring set;
1298 
1299         int opApply (scope int delegate (ref T[] token) dg)
1300         {
1301                 int     ret;
1302                 size_t  pos,
1303                         mark;
1304                 T[]     token;
1305 
1306                 // optimize for single delimiter case
1307                 if (set.length is 1)
1308                     while ((pos = locate (src, set[0], mark)) < src.length)
1309                           {
1310                           token = src [mark .. pos];
1311                           if ((ret = dg (token)) != 0)
1312                                return ret;
1313                           mark = pos + 1;
1314                           }
1315                 else
1316                    if (set.length > 1)
1317                        foreach (i, elem; src)
1318                                 if (contains (set, elem))
1319                                    {
1320                                    token = src [mark .. i];
1321                                    if ((ret = dg (token)) != 0)
1322                                         return ret;
1323                                    mark = i + 1;
1324                                    }
1325 
1326                 token = src [mark .. $];
1327                 if (mark <= src.length)
1328                     ret = dg (token);
1329 
1330                 return ret;
1331         }
1332 }
1333 
1334 /******************************************************************************
1335 
1336         Helper fruct for iterator patterns(). A fruct is a low
1337         impact mechanism for capturing context relating to an
1338         opApply (conjunction of the names struct and foreach)
1339 
1340 ******************************************************************************/
1341 
1342 public struct PatternFruct (T)
1343 {
1344         private T[] src, sub;
1345         private cstring pattern;
1346 
1347         int opApply (scope int delegate (ref T[] token) dg)
1348         {
1349                 int     ret;
1350                 size_t  pos,
1351                         mark;
1352                 T[]     token;
1353 
1354                 while ((pos = index (src, pattern, mark)) < src.length)
1355                       {
1356                       token = src [mark .. pos];
1357                       if ((ret = dg(token)) != 0)
1358                            return ret;
1359                       if (sub.ptr && (ret = dg(sub)) != 0)
1360                           return ret;
1361                       mark = pos + pattern.length;
1362                       }
1363 
1364                 token = src [mark .. $];
1365                 if (mark <= src.length)
1366                     ret = dg (token);
1367 
1368                 return ret;
1369         }
1370 }
1371 
1372 /******************************************************************************
1373 
1374         Helper fruct for iterator quotes(). A fruct is a low
1375         impact mechanism for capturing context relating to an
1376         opApply (conjunction of the names struct and foreach)
1377 
1378 ******************************************************************************/
1379 
1380 private struct QuoteFruct (T)
1381 {
1382         private T[]     src;
1383         private cstring set;
1384 
1385         int opApply (scope int delegate (ref T[] token) dg)
1386         {
1387                 int     ret;
1388                 size_t  mark;
1389                 T[]     token;
1390 
1391                 if (set.length)
1392                     for (size_t i=0; i < src.length; ++i)
1393                         {
1394                         T c = src[i];
1395                         if (c is '"' || c is '\'')
1396                             i = locate (src, c, i+1);
1397                         else
1398                            if (contains (set, c))
1399                               {
1400                               token = src [mark .. i];
1401                               if ((ret = dg (token)) != 0)
1402                                    return ret;
1403                               mark = i + 1;
1404                               }
1405                         }
1406 
1407                 token = src [mark .. $];
1408                 if (mark <= src.length)
1409                     ret = dg (token);
1410 
1411                 return ret;
1412         }
1413 }
1414 
1415 
1416 /******************************************************************************
1417 
1418 ******************************************************************************/
1419 
1420 unittest
1421 {
1422     char[64] tmp;
1423 
1424     test (isSpace (' ') && !isSpace ('d'));
1425 
1426     test (indexOf ("abc".ptr, 'a', 3) is 0);
1427     test (indexOf ("abc".ptr, 'b', 3) is 1);
1428     test (indexOf ("abc".ptr, 'c', 3) is 2);
1429     test (indexOf ("abc".ptr, 'd', 3) is 3);
1430     test (indexOf ("abcabcabc".ptr, 'd', 9) is 9);
1431 
1432     test (mismatch ("abc".ptr, "abc".ptr, 3) is 3);
1433     test (mismatch ("abc".ptr, "abd".ptr, 3) is 2);
1434     test (mismatch ("abc".ptr, "acc".ptr, 3) is 1);
1435     test (mismatch ("abc".ptr, "ccc".ptr, 3) is 0);
1436 
1437     test (matching ("abc".ptr, "abc".ptr, 3));
1438     test (matching ("abc".ptr, "abb".ptr, 3) is false);
1439 
1440     test (contains ("abc", 'a'));
1441     test (contains ("abc", 'b'));
1442     test (contains ("abc", 'c'));
1443     test (contains ("abc", 'd') is false);
1444 
1445     test (containsPattern ("abc", "ab"));
1446     test (containsPattern ("abc", "bc"));
1447     test (containsPattern ("abc", "abc"));
1448     test (containsPattern ("abc", "zabc") is false);
1449     test (containsPattern ("abc", "abcd") is false);
1450     test (containsPattern ("abc", "za") is false);
1451     test (containsPattern ("abc", "cd") is false);
1452 
1453     test (trim ("") == "");
1454     test (trim (" abc  ") == "abc");
1455     test (trim ("   ") == "");
1456 
1457     test (strip ("", '%') == "");
1458     test (strip ("%abc%%%", '%') == "abc");
1459     test (strip ("#####", '#') == "");
1460     test (stripl ("#####", '#') == "");
1461     test (stripl (" ###", ' ') == "###");
1462     test (stripl ("#####", 's') == "#####");
1463     test (stripr ("#####", '#') == "");
1464     test (stripr ("### ", ' ') == "###");
1465     test (stripr ("#####", 's') == "#####");
1466 
1467     test (replace ("abc".dup, 'b', ':') == "a:c");
1468     test (substitute ("abc", "bc", "x") == "ax");
1469 
1470     test (locate ("abc".dup, 'c', 1) is 2);
1471 
1472     test (locate ("abc", 'c') is 2);
1473     test (locate ("abc", 'a') is 0);
1474     test (locate ("abc", 'd') is 3);
1475     test (locate ("", 'c') is 0);
1476 
1477     test (locatePrior ("abce".dup, 'c') is 2);
1478     test (locatePrior ("abce", 'a') is 0);
1479     test (locatePrior ("abce", 'd') is 4);
1480     test (locatePrior ("abce", 'c', 3) is 2);
1481     test (locatePrior ("abce", 'c', 2) is 4);
1482     test (locatePrior ("", 'c') is 0);
1483 
1484     auto x = delimit ("::b", ":");
1485     test (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b");
1486     x = delimit ("a:bc:d", ":");
1487     test (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d");
1488     x = delimit ("abcd", ":");
1489     test (x.length is 1 && x[0] == "abcd");
1490     x = delimit ("abcd:", ":");
1491     test (x.length is 2 && x[0] == "abcd" && x[1] == "");
1492     x = delimit ("a;b$c#d:e@f", ";:$#@");
1493     test (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" &&
1494             x[3]=="d" && x[4]=="e" && x[5]=="f");
1495 
1496     test (locatePattern ("abcdefg".dup, "") is 7);
1497     test (locatePattern ("abcdefg", "g") is 6);
1498     test (locatePattern ("abcdefg", "abcdefg") is 0);
1499     test (locatePattern ("abcdefg", "abcdefgx") is 7);
1500     test (locatePattern ("abcdefg", "cce") is 7);
1501     test (locatePattern ("abcdefg", "cde") is 2);
1502     test (locatePattern ("abcdefgcde", "cde", 3) is 7);
1503 
1504     test (locatePatternPrior ("abcdefg".dup, "") is 7);
1505     test (locatePatternPrior ("abcdefg", "cce") is 7);
1506     test (locatePatternPrior ("abcdefg", "cde") is 2);
1507     test (locatePatternPrior ("abcdefgcde", "cde", 6) is 2);
1508     test (locatePatternPrior ("abcdefgcde", "cde", 4) is 2);
1509     test (locatePatternPrior ("abcdefg", "abcdefgx") is 7);
1510 
1511     x = splitLines ("a\nb\n");
1512     test (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == "");
1513     x = splitLines ("a\r\n");
1514     test (x.length is 2 && x[0] == "a" && x[1] == "");
1515 
1516     x = splitLines ("a");
1517     test (x.length is 1 && x[0] == "a");
1518     x = splitLines ("");
1519     test (x.length is 1);
1520 
1521     cstring[] q;
1522     foreach (element; quotes ("1 'avcc   cc ' 3", " "))
1523         q ~= element;
1524     test (q.length is 3 && q[0] == "1" && q[1] == "'avcc   cc '" && q[2] == "3");
1525 
1526     test (layout (tmp, "%1,%%%c %0", "abc", "efg") == "efg,%c abc");
1527 
1528     x = split ("one, two, three", ",");
1529     test (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three");
1530     x = split ("one, two, three", ", ");
1531     test (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three");
1532     x = split ("one, two, three", ",,");
1533     test (x.length is 1 && x[0] == "one, two, three");
1534     x = split ("one,,", ",");
1535     test (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == "");
1536 
1537     istring t, h;
1538     h =  head ("one:two:three", ":", t);
1539     test (h == "one" && t == "two:three");
1540     h = head ("one:::two:three", ":::", t);
1541     test (h == "one" && t == "two:three");
1542     h = head ("one:two:three", "*", t);
1543     test (h == "one:two:three" && t is null);
1544 
1545     t =  tail ("one:two:three", ":", h);
1546     test (h == "one:two" && t == "three");
1547     t = tail ("one:::two:three", ":::", h);
1548     test (h == "one" && t == "two:three");
1549     t = tail ("one:two:three", "*", h);
1550     test (t == "one:two:three" && h is null);
1551 
1552     test (chopl("hello world", "hello ") == "world");
1553     test (chopl("hello", "hello") == "");
1554     test (chopl("hello world", " ") == "hello world");
1555     test (chopl("hello world", "") == "hello world");
1556 
1557     test (chopr("hello world", " world") == "hello");
1558     test (chopr("hello", "hello") == "");
1559     test (chopr("hello world", " ") == "hello world");
1560     test (chopr("hello world", "") == "hello world");
1561 
1562     istring[] foo = ["one", "two", "three"];
1563     auto j = join (foo);
1564     test (j == "onetwothree");
1565     j = join (foo, ", ");
1566     test (j == "one, two, three");
1567     j = join (foo, " ", tmp);
1568     test (j == "one two three");
1569     test (j.ptr is tmp.ptr);
1570 
1571     test (repeat ("abc", 0) == "");
1572     test (repeat ("abc", 1) == "abc");
1573     test (repeat ("abc", 2) == "abcabc");
1574     test (repeat ("abc", 4) == "abcabcabcabc");
1575     test (repeat ("", 4) == "");
1576     char[10] rep;
1577     test (repeat ("abc", 0, rep) == "");
1578     test (repeat ("abc", 1, rep) == "abc");
1579     test (repeat ("abc", 2, rep) == "abcabc");
1580     test (repeat ("", 4, rep) == "");
1581 
1582     test (unescape ("abc") == "abc");
1583     test (unescape ("abc\\") == "abc\\");
1584     test (unescape ("abc\\t") == "abc\t");
1585     test (unescape ("abc\\tc") == "abc\tc");
1586     test (unescape ("\\t") == "\t");
1587     test (unescape ("\\tx") == "\tx");
1588     test (unescape ("\\v\\vx") == "\v\vx");
1589     test (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc");
1590 }
1591 
1592 debug (Util)
1593 {
1594         auto x = import("Util.d");
1595 
1596         void main()
1597         {
1598                 mismatch ("".ptr, x.ptr, 0);
1599                 indexOf ("".ptr, '@', 0);
1600                 char[] s;
1601                 split (s, " ");
1602                 //indexOf (s.ptr, '@', 0);
1603 
1604         }
1605 }