1 /******************************************************************************
2 
3     String splitting utilities
4 
    - The StrSplitIterator class splits a string by occurrences of a
      delimiter string.
    - The ChrSplitIterator class splits a string by occurrences of a delimiter
      character.
8 
9     Copyright:
10         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
11         All rights reserved.
12 
13     License:
14         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
15         Alternatively, this file may be distributed under the terms of the Tango
16         3-Clause BSD License (see LICENSE_BSD.txt for details).
17 
18  ******************************************************************************/
19 
20 module ocean.text.util.SplitIterator;
21 
import ocean.meta.types.Qualifiers;
import ocean.core.Array: concat, copy;
import ocean.core.Verify;
import ocean.io.Stdout;
import ocean.text.Search: SearchFruct, search;

import core.stdc.string: strlen, memchr, strcspn;
import core.stdc.ctype: isspace;
import core.sys.posix.sys.types: ssize_t;
31 
32 version (unittest) import ocean.core.Test;
33 
34 /******************************************************************************
35 
36     Splits a string by occurrences of a delimiter string.
37 
38     Memory friendly, suitable for stack-allocated scope instances.
39 
40  ******************************************************************************/
41 
class StrSplitIterator : ISplitIterator
{
    /**************************************************************************

        Type of the search object produced by search() for a cstring match
        string.

     **************************************************************************/

    alias Search = typeof(.search(cstring.init));

    /**************************************************************************

        Search object holding the delimiter as its match string. It also
        manages a table of indices to speed up searching. It is public and may
        be modified at any time through its own methods.

     **************************************************************************/

    public Search sf;

    /**************************************************************************

        Creates an iterator without content; call reset() to set the string to
        split.

        Params:
            delim_ = delimiter string

     **************************************************************************/

    public this ( cstring delim_ )
    {
        this.sf = .search(delim_);
    }

    /**************************************************************************

        Creates an iterator and sets the string to split.

        Params:
            delim_  = delimiter string
            content = string to split; it is sliced, not copied

     **************************************************************************/

    public this ( cstring delim_, cstring content )
    {
        this(delim_);
        this.reset(content);
    }

    /**************************************************************************

        Creates an iterator from an existing search object.

        Intended for 'scope' instances where a search object is kept somewhere
        else so that its search index can be reused.

        Params:
            sf_in = search object whose match string is the delimiter

     **************************************************************************/

    public this ( Search sf_in )
    {
        this.sf = sf_in;
    }

    /**************************************************************************

        Changes the delimiter string. delim_ may or may not be NUL-terminated;
        however, only the last character may be NUL.

        Params:
            delim_ = new delimiter string (will be copied into an internal
                     buffer)

        Returns:
            delim_

     **************************************************************************/

    public cstring delim ( cstring delim_ )
    {
        this.sf.match = delim_;

        return delim_;
    }

    /**************************************************************************

        Returns:
            the delimiter string currently in use (without NUL-terminator;
            slices an internal buffer)

     **************************************************************************/

    public cstring delim ( )
    {
        return this.sf.match;
    }

    /**************************************************************************

        Finds the first occurrence of the current delimiter string in str,
        searching from str[start] on.

        Params:
             str     = string to scan for the delimiter
             start   = index to start searching from

        Returns:
             index of the first occurrence of the current delimiter string in
             str or str.length if there is none

     **************************************************************************/

    public override size_t locateDelim ( cstring str, size_t start = 0 )
    {
        return this.sf.forward(str, start);
    }

    /**************************************************************************

        Skips the delimiter at the beginning of str.
        The result is meaningful only if str really starts with a delimiter;
        this method merely checks that str is long enough to hold one.

        Params:
            str = string starting with a delimiter

        Returns:
            index of the first character in str behind the leading delimiter,
            that is, the length of the delimiter string

     **************************************************************************/

    protected override size_t skipDelim ( cstring str )
    {
        verify (this.delim.length <= str.length);

        return this.sf.match.length;
    }
}
178 
unittest
{
    scope split = new StrSplitIterator("123");

    // Collapsing mode: leading, trailing and repeated delimiters must never
    // produce empty segments, so every input yields the same three segments.
    split.collapse = true;

    istring[] collapsed = ["ab", "cd", "efg"];

    istring[] inputs = [
        "123" ~ "ab" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123",
        "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123",
        "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg",
        "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg",

        "123" ~ "123" ~ "ab" ~ "123" ~ "123"~ "cd" ~ "123" ~ "efg",
        "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123" ~ "123"
    ];

    foreach (str; inputs)
    {
        foreach (element; split.reset(str))
        {
            test (split.n);
            test (split.n <= collapsed.length);
            test (element == collapsed[split.n - 1]);
        }
    }

    // Non-collapsing mode without leading/trailing delimiters.
    split.collapse = false;

    istring[] plain = ["ab", "cd", "efg"];

    foreach (element; split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg"))
    {
        test (split.n);
        test (split.n <= plain.length);
        test (element == plain[split.n - 1]);
    }

    // Non-collapsing mode: a leading and a trailing delimiter each produce an
    // empty segment.
    istring[] with_empty = ["", "ab", "cd", "efg", ""];

    foreach (element; split.reset("123" ~ "ab"~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123"))
    {
        test (split.n);
        test (split.n <= with_empty.length);
        test (element == with_empty[split.n - 1]);
    }

    // Stepping manually with next().
    split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg");

    test (split.next() == "ab");
    test (split.next() == "cd");
    test (split.next() == "efg");
}
230 
unittest
{
    // The two-argument constructor sets the content right away, so next() can
    // be called without a prior reset().
    scope split_constr = new StrSplitIterator("123", "ab" ~ "123" ~ "cd" ~ "123" ~ "efg");

    test (split_constr.next() == "ab");
    test (split_constr.next() == "cd");
    test (split_constr.next() == "efg");
}
239 
240 
241 /******************************************************************************
242 
243     Splits a string by occurrences of a delimiter character
244 
245  ******************************************************************************/
246 
class ChrSplitIterator : ISplitIterator
{
    /**************************************************************************

        The delimiter character. It is set by the constructor and, being a
        public field, may be changed at any time, even during iteration.

     **************************************************************************/

    public char delim;

    /**************************************************************************

        Creates an iterator without content; call reset() to set the string to
        split.

        Params:
            delim_ = delimiter character

     **************************************************************************/

    public this ( char delim_ )
    {
        this.delim = delim_;
    }

    /**************************************************************************

        Creates an iterator and sets the string to split.

        Params:
            delim_  = delimiter character
            content = string to split; it is sliced, not copied

     **************************************************************************/

    public this ( char delim_, cstring content )
    {
        this(delim_);
        this.reset(content);
    }

    /**************************************************************************

        Finds the first occurrence of delim in str, searching from str[start]
        on.

        Params:
             str   = string to scan
             start = index to start searching from, must be at most str.length

        Returns:
             index of the first occurrence of delim in str or str.length if
             there is none

     **************************************************************************/

    public override size_t locateDelim ( cstring str, size_t start = 0 )
    {
        verify(
            start <= str.length,
            typeof (this).stringof ~ ".locateDelim: start index out of range"
        );

        // Scan only the part of str from start on.
        auto found = cast (char*) memchr(str.ptr + start, this.delim, str.length - start);

        if (found is null)
        {
            return str.length;
        }

        // Convert the pointer to the match into an index relative to str.
        return found - str.ptr;
    }

    /**************************************************************************

        Skips the delimiter at the beginning of str.
        The result is meaningful only if str really starts with a delimiter;
        this method merely checks that str is not empty.

        Params:
            str = string starting with a delimiter

        Returns:
            index of the first character in str behind the leading delimiter,
            which is always 1

     **************************************************************************/

    protected override size_t skipDelim ( cstring str )
    {
        verify(1 <= str.length);

        return 1;
    }
}
335 
unittest
{
    // The two-argument constructor sets the content right away, so next() can
    // be called without a prior reset().
    scope split_constr = new ChrSplitIterator('1', "ab" ~ "1" ~ "cd" ~ "1" ~ "efg");

    test (split_constr.next() == "ab");
    test (split_constr.next() == "cd");
    test (split_constr.next() == "efg");
}
344 
345 
346 /******************************************************************************
347 
348     Base class
349 
350  ******************************************************************************/
351 
abstract class ISplitIterator
{
    /**************************************************************************

        Set to true to collapse consecutive delimiter occurrences to a single
        one to prevent producing empty segments.

     **************************************************************************/

    public bool collapse = false;

    /**************************************************************************

        Set to true to also produce the remaining content after the last
        delimiter occurrence, or the whole content when no delimiter is found,
        as a segment.

     **************************************************************************/

    public bool include_remaining = true;

    /**************************************************************************

        content:    the string to split, as passed to reset().
        remaining_: slice to the part of content that has not been consumed
                    yet.

     **************************************************************************/

    private cstring content, remaining_;

    /**************************************************************************

        Number of segments produced since the last reset().

     **************************************************************************/

    private uint n_ = 0;

    /**************************************************************************

        Union of the supported 'foreach' iteration delegate types. Which member
        is valid is tracked by the caller (see opApply_()).

     **************************************************************************/

    protected union IterationDelegate
    {
        int delegate ( ref size_t pos, ref cstring segment ) with_pos;

        int delegate ( ref cstring segment ) without_pos;
    }

    /**************************************************************************

        Consistency check

     **************************************************************************/

    invariant ( )
    {
        // Segments can only have been produced from a non-null content.
        if (this.n_)
        {
            assert (this.content);
        }

        /*
         * remaining_ must always be the tail of content. A single
         * ---
         * assert (this.content[$ - this.remaining_.length .. $] is this.remaining_);
         * ---
         * would compare pointer and length for identity (not the characters),
         * but it would wrongly fail while remaining_ is an empty slice that
         * does not point into content (null or ""), as set by next() and
         * opApply_(). Hence the length is always checked and the pointer only
         * if remaining_ is not empty.
         */

        assert (this.remaining_.length <= this.content.length);

        if (this.remaining_.length)
        {
            assert (this.remaining_.ptr is &this.content[$ - this.remaining_.length]);
        }
    }

    /**************************************************************************

        Sets the content string to split on next iteration and resets the
        segment counter.

        Params:
            content = Content string to split; pass null to clear the content.
                      content will be sliced (not copied).

        Returns:
            this instance

     **************************************************************************/

    public typeof (this) reset ( cstring content = null )
    {
        this.content    = content;
        this.remaining_ = this.content;
        this.n_         = 0;

        return this;
    }

    /**************************************************************************

        'foreach' iteration over string slices between the current and the next
        delimiter. n() returns the number of 'foreach' loop cycles so far,
        remaining() the slice after the next delimiter to the content end.
        If no delimiter was found and include_remaining is false, n() is 0
        after 'foreach' has finished and remaining() returns the content.

        segment is a slice of the content, not a copy.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring segment ) dg_in )
    {
        IterationDelegate dg;

        dg.without_pos = dg_in;

        return this.opApply_(false, dg);
    }

    /**************************************************************************

        'foreach' iteration over string slices between the current and the next
        delimiter. n() returns the number of 'foreach' loop cycles so far,
        remaining() the slice after the next delimiter to the content end.
        If no delimiter was found and include_remaining is false, n() is 0
        after 'foreach' has finished and remaining() returns the content.

        pos references the current content position and may be changed to
        specify the position where searching should be continued. If changed,
        pos must be at most content.length.

        segment is a slice of the content, not a copy.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref size_t pos, ref cstring segment ) dg_in )
    {
        IterationDelegate dg;

        dg.with_pos = dg_in;

        return this.opApply_(true, dg);
    }

    /**************************************************************************

        Returns:
            the number of segments produced since the last reset(), either by
            'foreach' loop cycles or by next() calls that returned a segment.
            The value is 0 if no segment has been produced yet.

     **************************************************************************/

    public uint n ( )
    {
        return this.n_;
    }

    /**************************************************************************

        Returns:
            - a slice to the content string after the next delimiter when
              currently doing a 'foreach' iteration,
            - a slice to the content string that has not been consumed after a
              'foreach' iteration has finished or next() was called,
            - the content string if no iteration has been done since the last
              reset().

     **************************************************************************/

    public cstring remaining ( )
    {
        return this.remaining_;
    }

    /**************************************************************************

        Locates the first delimiter occurrence in str starting from str[start].

        Params:
            str   = str to locate first delimiter occurrence in
            start = start index

        Returns:
            index of the first delimiter occurrence in str or str.length if not
            found

     **************************************************************************/

    abstract size_t locateDelim ( cstring str, size_t start = 0 );

    /**************************************************************************

        Locates the first delimiter occurrence in the current content string
        starting from content[start].

        Params:
            start = start index, must be at most content.length

        Returns:
            index of the first delimiter occurrence in the content string, as
            reported by locateDelim(content, start); that is content.length if
            no delimiter was found.

     **************************************************************************/

    public size_t locateDelim ( size_t start = 0 )
    {
        verify (start <= this.content.length,
            typeof (this).stringof ~ ".locateDelim(): start index out of range");
        return this.locateDelim(this.content, start);
    }

    /**************************************************************************

        Skips initial consecutive occurrences of the current delimiter in the
        currently remaining content. The remaining content is updated to start
        behind the skipped delimiters.

        Returns:
             remaining content after the delimiters have been skipped.

     **************************************************************************/

    public cstring skipLeadingDelims ( )
    {
        size_t start = 0,
               pos   = this.locateDelim(this.remaining_);

        // As long as a delimiter is found right at the current start
        // position, move the start position behind it.
        while (pos == start && pos < this.remaining_.length)
        {
            start = pos + this.skipDelim(this.remaining_[pos .. $]);

            pos = this.locateDelim(this.remaining_, start);
        }

        return this.remaining_ = this.remaining_[start .. $];
    }

    /**************************************************************************

        Searches the next delimiter and produces the segment in front of it.

        n() is incremented only if a segment is returned. If collapse is true,
        leading delimiters are skipped first and, consistently with 'foreach'
        iteration, no empty segment is produced if nothing but delimiters was
        left.

        Returns:
            a slice to the content between the previous and next delimiter, if
            found. If not found and include_remaining is true, the remaining
            content is returned. null is returned if there is no remaining
            content (in collapse mode: nothing but delimiters), or if no
            delimiter was found and include_remaining is false.

     **************************************************************************/

    public cstring next ( )
    {
        if (!this.remaining_.length)
        {
            return null;
        }

        if (this.collapse)
        {
            // Only delimiters were left: there is no further segment. This
            // mirrors opApply_(), which does not produce an empty trailing
            // segment in collapse mode either.
            if (!this.skipLeadingDelims().length)
            {
                return null;
            }
        }

        size_t start = this.content.length - this.remaining_.length,
               end   = this.locateDelim(start);

        if (end < this.content.length)
        {
            // Delimiter found: the segment ends in front of it, the remaining
            // content starts behind it.
            this.n_++;

            this.remaining_ = this.content[end + this.skipDelim(this.content[end .. $]) .. $];

            return this.content[start .. end];
        }

        if (this.include_remaining)
        {
            // No further delimiter: hand out everything that is left and mark
            // the content as fully consumed.
            this.n_++;

            cstring segment = this.remaining_;

            this.remaining_ = null;

            return segment;
        }

        return null;
    }

    /**************************************************************************

        'foreach' iteration over string slices between the current and the next
        delimiter.

        Params:
            with_pos = true: use dg.with_pos, false: use dg.without_pos
            dg       = iteration delegate

        Returns:
            passes through dg() return value.

     **************************************************************************/

    protected int opApply_ ( bool with_pos, IterationDelegate dg )
    {
        int result = 0;

        if (this.remaining_.length)
        {
            if (this.collapse)
            {
                this.skipLeadingDelims();
            }

            // Index in content where the current segment starts.
            size_t start = this.content.length - this.remaining_.length;

            for (size_t pos = this.locateDelim(start);
                        pos < this.content.length;
                        pos = this.locateDelim(start))
            {
                // Index in content right behind the delimiter found at pos.
                size_t next = pos + this.skipDelim(this.content[pos .. $]);

                // In collapse mode a delimiter right at the segment start
                // would yield an empty segment, so it is skipped silently.
                if (!(pos == start && collapse))
                {
                    this.n_++;

                    cstring segment  = this.content[start ..  pos];
                    this.remaining_ = this.content[next .. $];

                    if (with_pos)
                    {
                        // The delegate may change next to continue searching
                        // from a different position.
                        result = dg.with_pos(next, segment);

                        verify (next <= this.content.length,
                                typeof (this).stringof ~ ": iteration delegate "
                                ~ "set the position out of range");

                        this.remaining_ = this.content[next .. $];
                    }
                    else
                    {
                        result = dg.without_pos(segment);
                    }
                }

                start = next;

                // Stop if the delegate requested it or the content end is
                // reached.
                if (result || start >= this.content.length) break;
            }

            this.remaining_ = this.content[start .. $];

            // Produce the tail behind the last delimiter unless iteration was
            // aborted or, in collapse mode, the tail is empty.
            if (this.include_remaining &&
                !(result || (!this.remaining_.length && this.collapse)))
            {
                this.n_++;

                cstring segment = this.remaining_;

                this.remaining_ = "";

                result = with_pos? dg.with_pos(start, segment) :
                                   dg.without_pos(segment);
            }
        }

        return result;
    }

    /**************************************************************************

        Skips the delimiter which str starts with.
        The return value is at most str.length.
        It is assured that str starts with a delimiter so a subclass may return
        an undefined result otherwise. Additionally, a subclass is encouraged to
        check that str starts with a delimiter and/or is long enough to skip a
        leading delimiter.

        Params:
            str = string starting with delimiter

        Returns:
            index of the first character after the starting delimiter in str

     **************************************************************************/

    abstract protected size_t skipDelim ( cstring str );

    /***************************************************************************

        Trims leading and trailing white space, as classified by isspace(),
        from str.

        Params:
             str       = input string

        Returns:
             a slice of str without leading and trailing white space; an empty
             slice of str if str contains nothing but white space; null if str
             is null.

    ***************************************************************************/

    static cstring trim ( cstring str )
    {
        // Cut off trailing white space: slice up to and including the last
        // non-space character.
        foreach_reverse (i, c; str)
        {
            if (!isspace(c))
            {
                str = str[0 .. i + 1];
                break;
            }
        }

        // Cut off leading white space: slice from the first non-space
        // character on.
        foreach (i, c; str)
        {
            if (!isspace(c))
            {
                return str[i .. $];
            }
        }

        // No non-space character found.
        return str? str[0 .. 0] : null;
    }

}