1 /******************************************************************************
2 
3     String splitting utilities
4 
5     - The SplitStr class splits a string by occurrences of a delimiter string.
6     - The SplitChr class splits a string by occurrences of a delimiter
7       character.
8 
9     Copyright:
10         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
11         All rights reserved.
12 
13     License:
14         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
15         Alternatively, this file may be distributed under the terms of the Tango
16         3-Clause BSD License (see LICENSE_BSD.txt for details).
17 
18  ******************************************************************************/
19 
20 module ocean.text.util.SplitIterator;
21 
22 
23 import ocean.transition;
24 
25 import ocean.core.Array: concat, copy;
26 import ocean.core.Verify;
27 
28 import ocean.stdc.string: strlen, memchr, strcspn;
29 import core.stdc.ctype: isspace;
30 
31 import ocean.stdc.posix.sys.types: ssize_t;
32 
33 import ocean.io.Stdout;
34 import ocean.text.Search: SearchFruct, search;
35 
36 version(UnitTest) import ocean.core.Test;
37 
38 /******************************************************************************
39 
40     Splits a string by occurrences of a delimiter string.
41 
42     Memory friendly, suitable for stack-allocated scope instances.
43 
44  ******************************************************************************/
45 
46 class StrSplitIterator : ISplitIterator
47 {
48     alias typeof(.search(cstring.init)) Search;
49 
50     /**************************************************************************
51 
52         Contains the delimiter as match string and manages a table of indices to
53         improve the search algorithm efficiency. May be modified at any time
54         using its methods.
55 
56      **************************************************************************/
57 
58     public Search sf;
59 
60     /**************************************************************************
61 
62         Constructor
63 
64         Params:
65             delim_ = delimiter string
66 
67      **************************************************************************/
68 
69     public this ( cstring delim_ )
70     {
71         this.sf = .search(delim_);
72     }
73 
74     /**************************************************************************
75 
76         Constructor
77 
78         Params:
79             delim_ = delimiter string
80             content = Content string to split. content will be sliced (not copied).
81 
82      **************************************************************************/
83 
84     public this ( cstring delim_, cstring content )
85     {
86         this(delim_);
87         this.reset(content);
88     }
89 
90     /**************************************************************************
91 
92         Constructor
93 
94         Intended to be used for a 'scope' instance where a SearchFruct instance
95         is stored somewhere in order to reuse the search index.
96 
97         Params:
98             sf_in = delimiter string
99 
100      **************************************************************************/
101 
102     public this ( Search sf_in )
103     {
104         this.sf = sf_in;
105     }
106 
107     /**************************************************************************
108 
109         Sets the delimiter string. delim_ may or may not be NUL-terminated;
110         however, only the last character may be NUL.
111 
112         Params:
113             delim_ = new delimiter string (will be copied into an internal
114                      buffer)
115 
116         Returns:
117             delim_
118 
119      **************************************************************************/
120 
121     public cstring delim ( cstring delim_ )
122     {
123         this.sf.match = delim_;
124 
125         return delim_;
126     }
127 
128     /**************************************************************************
129 
130         Returns:
131             current delimiter string (without NUL-terminator; slices an internal
132             buffer)
133 
134      **************************************************************************/
135 
136     public cstring delim ( )
137     {
138         return this.sf.match;
139     }
140 
141     /**************************************************************************
142 
143         Locates the first occurrence of the current delimiter string in str,
144         starting from str[start].
145 
146         Params:
147              str     = string to scan for delimiter
148              start   = search start index
149 
150         Returns:
151              index of first occurrence of the current delimiter string in str or
152              str.length if not found
153 
154      **************************************************************************/
155 
156     public override size_t locateDelim ( cstring str, size_t start = 0 )
157     {
158         return this.sf.forward(str, start);
159     }
160 
161     /**************************************************************************
162 
163         Skips the delimiter which str starts with.
164         Note that the result is correct only if str really starts with a
165         delimiter.
166 
167         Params:
168             str = string starting with delimiter
169 
170         Returns:
171             index of the first character after the starting delimiter in str
172 
173      **************************************************************************/
174 
175     protected override size_t skipDelim ( cstring str )
176     {
177         verify (str.length >= this.delim.length);
178 
179         return this.sf.match.length;
180     }
181 }
182 
183 unittest
184 {
185     scope split = new StrSplitIterator("123");
186 
187     split.collapse = true;
188 
189     foreach (str; ["123" ~ "ab" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123",
190                    "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123",
191                    "123" ~ "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg",
192                    "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg",
193 
194                    "123" ~ "123" ~ "ab" ~ "123" ~ "123"~ "cd" ~ "123" ~ "efg",
195                    "ab" ~ "123" ~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123" ~ "123"])
196     {
197         foreach (element; split.reset(str))
198         {
199             istring[] elements = ["ab", "cd", "efg"];
200 
201             test (split.n);
202             test (split.n <= elements.length);
203             test (element == elements[split.n - 1]);
204         }
205     }
206 
207     split.collapse = false;
208 
209     foreach (element; split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg"))
210     {
211         istring[] elements = ["ab", "cd", "efg"];
212 
213         test (split.n);
214         test (split.n <= elements.length);
215         test (element == elements[split.n - 1]);
216     }
217 
218     foreach (element; split.reset("123" ~ "ab"~ "123" ~ "cd" ~ "123" ~ "efg" ~ "123"))
219     {
220         istring[] elements = ["", "ab", "cd", "efg", ""];
221 
222         test (split.n);
223         test (split.n <= elements.length);
224         test (element == elements[split.n - 1]);
225     }
226 
227     split.reset("ab" ~ "123" ~ "cd" ~ "123" ~ "efg");
228 
229     test (split.next == "ab");
230     test (split.next == "cd");
231     test (split.next == "efg");
232 
233 }
234 
235 unittest
236 {
237     scope split_constr = new StrSplitIterator("123", "ab" ~ "123" ~ "cd" ~ "123" ~ "efg");
238 
239     test (split_constr.next == "ab");
240     test (split_constr.next == "cd");
241     test (split_constr.next == "efg");
242 }
243 
244 
245 /******************************************************************************
246 
247     Splits a string by occurrences of a delimiter character
248 
249  ******************************************************************************/
250 
251 class ChrSplitIterator : ISplitIterator
252 {
253     /**************************************************************************
254 
255         Delimiter character. Must be specified in the constructor but may be
256         changed at any time, even during iteration.
257 
258      **************************************************************************/
259 
260     public char delim;
261 
262     /**************************************************************************
263 
264         Constructor
265 
266         Params:
267             delim_ = delimiter character
268 
269      **************************************************************************/
270 
271     public this ( char delim_ )
272     {
273         this.delim = delim_;
274     }
275 
276     /**************************************************************************
277 
278         Constructor
279 
280         Params:
281             delim_ = delimiter character
282             content = Content string to split. content will be sliced (not copied).
283 
284      **************************************************************************/
285 
286     public this ( char delim_, cstring content )
287     {
288         this(delim_);
289         this.reset(content);
290     }
291 
292     /**************************************************************************
293 
294         Locates the first occurrence of delim in str starting with str[start].
295 
296         Params:
297              str   = string to scan
298              start = search start index, must be at most str.length
299 
300         Returns:
301              index of first occurrence of delim in str or str.length if not
302              found
303 
304      **************************************************************************/
305 
306     public override size_t locateDelim ( cstring str, size_t start = 0 )
307     {
308         verify(
309             start <= str.length,
310             typeof (this).stringof ~ ".locateDelim: start index out of range"
311         );
312 
313         char* item = cast (char*) memchr(str.ptr + start, this.delim, str.length - start);
314 
315         return item? item - str.ptr : str.length;
316     }
317 
318     /**************************************************************************
319 
320         Skips the delimiter which str starts with.
321         Note that the result is correct only if str really starts with a
322         delimiter.
323 
324         Params:
325             str = string starting with delimiter
326 
327         Returns:
328             index of the first character after the starting delimiter in str
329 
330      **************************************************************************/
331 
332     protected override size_t skipDelim ( cstring str )
333     {
334         verify(str.length >= 1);
335 
336         return 1;
337     }
338 }
339 
340 unittest
341 {
342     scope split_constr = new ChrSplitIterator('1', "ab" ~ "1" ~ "cd" ~ "1" ~ "efg");
343 
344     test (split_constr.next == "ab");
345     test (split_constr.next == "cd");
346     test (split_constr.next == "efg");
347 }
348 
349 
350 /******************************************************************************
351 
352     Base class
353 
354  ******************************************************************************/
355 
356 abstract class ISplitIterator
357 {
358     /**************************************************************************
359 
360         Set to true to collapse consecutive delimiter occurrences to a single
361         one to prevent producing empty segments.
362 
363      **************************************************************************/
364 
365     public bool collapse = false;
366 
367     /**************************************************************************
368 
369         Set to true to do a 'foreach' cycle with the remaining content after
370         the last delimiter occurrence or when no delimiter is found.
371 
372      **************************************************************************/
373 
374     public bool include_remaining = true;
375 
376     /**************************************************************************
377 
378         String to split on next iteration and slice to remaining content.
379 
380      **************************************************************************/
381 
382     private cstring content, remaining_;
383 
384     /**************************************************************************
385 
386         'foreach' iteration counter
387 
388      **************************************************************************/
389 
390     private uint n_ = 0;
391 
392     /**************************************************************************
393 
394         Union of the supported 'foreach' iteration delegate types
395 
396      **************************************************************************/
397 
398     protected union IterationDelegate
399     {
400         int delegate ( ref size_t pos, ref cstring segment ) with_pos;
401 
402         int delegate ( ref cstring segment ) without_pos;
403     }
404 
405     /**************************************************************************
406 
407         Consistency check
408 
409      **************************************************************************/
410 
411     invariant ( )
412     {
413         if (this.n_)
414         {
415             assert (this.content);
416         }
417 
418         /*
419          * TODO: Is this what
420          * ---
421          * assert (this.content[$ - this.remaining_.length .. $] is this.remaining_);
422          * ---
423          * does, that is, comparing the memory location for identity, not the
424          * content? If so, replace it.
425          */
426 
427         assert (this.remaining_.length <= this.content.length);
428 
429         if (this.remaining_.length)
430         {
431             assert (this.remaining_.ptr is &this.content[$ - this.remaining_.length]);
432         }
433     }
434 
435     /**************************************************************************
436 
437         Sets the content string to split on next iteration.
438 
439         Params:
440             content = Content string to split; pass null to clear the content.
441                       content will be sliced (not copied).
442 
443         Returns:
444             this instance
445 
446      **************************************************************************/
447 
448     public typeof (this) reset ( cstring content = null )
449     {
450         this.content    = content;
451         this.remaining_ = this.content;
452         this.n_         = 0;
453 
454         return this;
455     }
456 
457     /**************************************************************************
458 
459         'foreach' iteration over string slices between the current and the next
460         delimiter. n() returns the number of 'foreach' loop cycles so far,
461         remaining() the slice after the next delimiter to the content end.
462         If no delimiter was found, n() is 0 after 'foreach' has finished and
463         remaining() returns the content.
464 
465         segment slices content so do not modify it. However, the content of
466         segment may be modified which will result in an in-place modification
467         of the content.
468 
469      **************************************************************************/
470 
471     public int opApply ( scope int delegate ( ref cstring segment ) dg_in )
472     {
473         IterationDelegate dg;
474 
475         dg.without_pos = dg_in;
476 
477         return this.opApply_(false, dg);
478     }
479 
480     /**************************************************************************
481 
482         'foreach' iteration over string slices between the current and the next
483         delimiter. n() returns the number of 'foreach' loop cycles so far,
484         remaining() the slice after the next delimiter to the content end.
485         If no delimiter was found, n() is 0 after 'foreach' has finished and
486         remaining() returns the content.
487 
488         pos references the current content position and may be changed to
489         specify the position where searching should be continued. If changed,
490         pos must be at most content.length.
491 
492         segment slices content so do not modify it. However, the content of
493         segment may be modified which will result in an in-place modification
494         of the content.
495 
496      **************************************************************************/
497 
498     public int opApply ( scope int delegate ( ref size_t pos, ref cstring segment ) dg_in )
499     {
500         IterationDelegate dg;
501 
502         dg.with_pos = dg_in;
503 
504         return this.opApply_(true, dg);
505     }
506 
507     /**************************************************************************
508 
509         Returns:
510             the number of 'foreach' loop cycles so far. If the value is 0,
511             either no 'foreach' iteration has been done since last reset() or
512             there is no delimiter occurrence in the content string. remaining()
513             will then return the content string.
514 
515      **************************************************************************/
516 
517     public uint n ( )
518     {
519         return this.n_;
520     }
521 
522     /**************************************************************************
523 
524         Returns:
525             - a slice to the content string after the next delimiter when
526               currently doing a 'foreach' iteration,
527             - a slice to the content string after the last delimiter after a
528               'foreach' iteration has finished,
529             - the content string if no 'foreach' iteration has been done or
530               there is no delimiter occurrence in the content string.
531 
532      **************************************************************************/
533 
534     public cstring remaining ( )
535     {
536         return this.remaining_;
537     }
538 
539     /**************************************************************************
540 
541         Locates the first delimiter occurrence in str starting from str[start].
542 
543 
544 
545         Params:
546             str   = str to locate first delimiter occurrence in
547             start = start index
548 
549         Returns:
550             index of the first delimiter occurrence in str or str.length if not
551             found
552 
553      **************************************************************************/
554 
555     abstract size_t locateDelim ( cstring str, size_t start = 0 );
556 
557     /**************************************************************************
558 
559         Locates the first delimiter occurrence in the current content string
560         starting from content[start].
561 
562         Params:
563             start = start index, must be at most content.length
564 
565         Returns:
566             index of the first delimiter occurrence in str or str.length
567             either not found or start >= content.length
568 
569      **************************************************************************/
570 
571     public size_t locateDelim ( size_t start = 0 )
572     {
573         verify (start <= this.content.length,
574             typeof (this).stringof ~ ".locateDelim(): start index out of range");
575         return this.locateDelim(this.content, start);
576     }
577 
578     /**************************************************************************
579 
580         Skips initial consecutive occurrences of the current delimiter in the
581         currently remaining content.
582 
583         Returns:
584              remaining content after the delimiters have been skipped.
585 
586      **************************************************************************/
587 
588     public cstring skipLeadingDelims ( )
589     {
590         size_t start = 0,
591                pos   = this.locateDelim(this.remaining_);
592 
593         while (pos == start && pos < this.remaining_.length)
594         {
595             start = pos + this.skipDelim(this.remaining_[pos .. $]);
596 
597             pos = this.locateDelim(this.remaining_, start);
598         }
599 
600         return this.remaining_ = this.remaining_[start .. $];
601     }
602 
603     /**************************************************************************
604 
605         Searches the next delimiter.
606 
607         Returns:
608             a slice to the content between the previous and next delimiter, if
609             found. If not found and include_remaining is true, the remaining
610             content is returned or null if include_remaining is false.
611 
612      **************************************************************************/
613 
614     public cstring next ( )
615     {
616         if (this.remaining_.length)
617         {
618             this.n_++;
619 
620             if (this.collapse)
621             {
622                 this.skipLeadingDelims();
623             }
624 
625             size_t start = this.content.length - this.remaining_.length,
626                    end   = this.locateDelim(start);
627 
628             if (end < this.content.length)
629             {
630                 this.remaining_ = this.content[end + this.skipDelim(this.content[end .. $]) .. $];
631 
632                 return this.content[start .. end];
633             }
634             else if (this.include_remaining)
635             {
636                 scope (success) this.remaining_ = null;
637 
638                 return this.remaining_;
639             }
640             else
641             {
642                 return null;
643             }
644         }
645         else
646         {
647             return null;
648         }
649     }
650 
651     /**************************************************************************
652 
653         'foreach' iteration over string slices between the current and the next
654         delimiter.
655 
656         Params:
657             with_pos = true: use dg.with_pos, false: user dg.without_pos
658             dg       = iteration delegate
659 
660         Returns:
661             passes through dg() return value.
662 
663      **************************************************************************/
664 
665     protected int opApply_ ( bool with_pos, IterationDelegate dg )
666     {
667         int result = 0;
668 
669         if (this.remaining_.length)
670         {
671             if (this.collapse)
672             {
673                 this.skipLeadingDelims();
674             }
675 
676             size_t start = this.content.length - this.remaining_.length;
677 
678             for (size_t pos = this.locateDelim(start);
679                         pos < this.content.length;
680                         pos = this.locateDelim(start))
681             {
682                 size_t next = pos + this.skipDelim(this.content[pos .. $]);
683 
684                 if (!(pos == start && collapse))
685                 {
686                     this.n_++;
687 
688                     cstring segment  = this.content[start ..  pos];
689                     this.remaining_ = this.content[next .. $];
690 
691                     if (with_pos)
692                     {
693                         result = dg.with_pos(next, segment);
694 
695                         verify (next <= this.content.length,
696                                 typeof (this).stringof ~ ": iteration delegate "
697                                 ~ "set the position out of range");
698 
699                         this.remaining_ = this.content[next .. $];
700                     }
701                     else
702                     {
703                         result = dg.without_pos(segment);
704                     }
705                 }
706 
707                 start = next;
708 
709                 if (result || start >= this.content.length) break;
710             }
711 
712             this.remaining_ = this.content[start .. $];
713 
714             if (this.include_remaining &&
715                 !(result || (!this.remaining_.length && this.collapse)))
716             {
717                 this.n_++;
718 
719                 cstring segment = this.remaining_;
720 
721                 this.remaining_ = "";
722 
723                 result = with_pos? dg.with_pos(start, segment) :
724                                    dg.without_pos(segment);
725             }
726         }
727 
728         return result;
729     }
730 
731     /**************************************************************************
732 
733         Skips the delimiter which str starts with.
734         The return value is at most str.length.
735         It is assured that str starts with a delimiter so a subclass may return
736         an undefined result otherwise. Additionally, a subclass is encouraged to
737         use an 'in' contract to ensure str starts with a delimiter and/or is
738         long enought to skip a leading delimiter.
739 
740         Params:
741             str = string starting with delimiter
742 
743         Returns:
744             index of the first character after the starting delimiter in str
745 
746      **************************************************************************/
747 
748     abstract protected size_t skipDelim ( cstring str );
749 
750     /***************************************************************************
751 
752         Trims white space from str.
753 
754         Params:
755              str       = input string
756 
757         Returns:
758              the resulting string
759 
760     ***************************************************************************/
761 
762     static cstring trim ( cstring str )
763     {
764         foreach_reverse (i, c; str)
765         {
766             if (!isspace(c))
767             {
768                 str = str[0 .. i + 1];
769                 break;
770             }
771         }
772 
773         foreach (i, c; str)
774         {
775             if (!isspace(c))
776             {
777                 return str[i .. $];
778             }
779         }
780 
781         return str? str[0 .. 0] : null;
782     }
783 
784 }