1 /*******************************************************************************
2 
3     Copyright:
4         Copyright (c) 2009 Kris Bell.
5         Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6         All rights reserved.
7 
8     License:
9         Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10         See LICENSE_TANGO.txt for details.
11 
12     Version: May 2009: Initial release
13 
14     Authors: Kris
15 
16 *******************************************************************************/
17 
18 module ocean.text.Search;
19 
20 import ocean.meta.types.Qualifiers : mstring, cstring;
21 
22 import Util = ocean.text.Util;
23 
24 version (unittest) import ocean.core.Test;
25 
26 /******************************************************************************
27 
28   Returns a lightweight pattern matcher, good for short patterns
29   and/or short to medium length content. Brute-force approach with
30   fast multi-byte comparisons
31 
32  ******************************************************************************/
33 
34 FindFruct find (cstring what)
35 {
36     return FindFruct(what);
37 }
38 
39 /******************************************************************************
40 
41   Returns a welterweight pattern matcher, good for long patterns
42   and/or extensive content. Based on the QS algorithm which is a
43   Boyer-Moore variant. Does not allocate memory for the alphabet.
44 
45   Generally becomes faster as the match-length grows
46 
47  ******************************************************************************/
48 
49 SearchFruct search (cstring what)
50 {
51     return SearchFruct(what);
52 }
53 
54 /******************************************************************************
55 
56     Convenient bundle of lightweight find utilities, without the
57     hassle of IFTI problems. Create one of these using the find()
58     function:
59     ---
60     auto match = find ("foo");
61     auto content = "wumpus foo bar"
62 
63     // search in the forward direction
64     auto index = match.forward (content);
65     assert (index is 7);
66 
67     // search again - returns length when no match found
68     assert (match.forward(content, index+1) is content.length);
69     ---
70 
71     Searching operates both forward and backward, with an optional
72     start offset (can be more convenient than slicing the content).
73     There are methods to replace matches within given content, and
74     others which return foreach() iterators for traversing content.
75 
76     SearchFruct is a more sophisticated variant, which operates more
77     efficiently on longer matches and/or more extensive content.
78 
79  ******************************************************************************/
80 
81 public struct FindFruct
82 {
83     private cstring what;
84 
85     /***********************************************************************
86 
87       Search forward in the given content, starting at the
88       optional index.
89 
90       Returns the index of a match, or content.length where
91       no match was located.
92 
93      ***********************************************************************/
94 
95     size_t forward (cstring content, size_t ofs = 0)
96     {
97         return Util.index (content, what, ofs);
98     }
99 
100     /***********************************************************************
101 
102       Search backward in the given content, starting at the
103       optional index.
104 
105       Returns the index of a match, or content.length where
106       no match was located.
107 
108      ***********************************************************************/
109 
110     size_t reverse (cstring content, size_t ofs = size_t.max)
111     {
112         return Util.rindex (content, what, ofs);
113     }
114 
115     /***********************************************************************
116 
117       Return the match text
118 
119      ***********************************************************************/
120 
121     cstring match ()
122     {
123         return what;
124     }
125 
126     /***********************************************************************
127 
128       Reset the text to match
129 
130      ***********************************************************************/
131 
132     void match (cstring what)
133     {
134         this.what = what;
135     }
136 
137     /***********************************************************************
138 
139       Returns true if there is a match within the given content
140 
141      ***********************************************************************/
142 
143     bool within (cstring content)
144     {
145         return forward(content) != content.length;
146     }
147 
148     /***********************************************************************
149 
150       Returns number of matches within the given content
151 
152      ***********************************************************************/
153 
154     size_t count (cstring content)
155     {
156         size_t mark, count;
157 
158         while ((mark = Util.index (content, what, mark)) != content.length)
159             ++count, ++mark;
160         return count;
161     }
162 
163     /***********************************************************************
164 
165       Replace all matches with the given character. Use method
166       tokens() instead to avoid heap activity.
167 
168       Returns a copy of the content with replacements made
169 
170      ***********************************************************************/
171 
172     mstring replace (cstring content, char chr)
173     {
174         return replace (content, (&chr)[0..1]);
175     }
176 
177     /***********************************************************************
178 
179       Replace all matches with the given substitution. Use
180       method tokens() instead to avoid heap activity.
181 
182       Returns a copy of the content with replacements made
183 
184      ***********************************************************************/
185 
186     mstring replace (cstring content, cstring sub = null)
187     {
188         mstring output;
189 
190         foreach (s; tokens (content, sub))
191             output ~= s;
192         return output;
193     }
194 
195     /***********************************************************************
196 
197       Returns a foreach() iterator which exposes text segments
198       between all matches within the given content. Substitution
199       text is also injected in place of each match, and null can
200       be used to indicate removal instead:
201       ---
202       char[] result;
203 
204       auto match = find ("foo");
205       foreach (token; match.tokens ("$foo&&foo*", "bar"))
206       result ~= token;
207       assert (result == "$bar&&bar*");
208       ---
209 
210       This mechanism avoids internal heap activity.
211 
212      ***********************************************************************/
213 
214     Util.PatternFruct!(const(char)) tokens (cstring content, cstring sub = null)
215     {
216         return Util.patterns (content, what, sub);
217     }
218 
219     /***********************************************************************
220 
221       Returns a foreach() iterator which exposes the indices of
222       all matches within the given content:
223       ---
224       int count;
225 
226       auto f = find ("foo");
227       foreach (index; f.indices("$foo&&foo*"))
228       ++count;
229       assert (count is 2);
230       ---
231 
232      ***********************************************************************/
233 
234     Indices indices (cstring content)
235     {
236         return Indices (what, content);
237     }
238 
239     /***********************************************************************
240 
241       Simple foreach() iterator
242 
243      ***********************************************************************/
244 
245     private struct Indices
246     {
247         cstring what, content;
248 
249         int opApply (scope int delegate (ref size_t index) dg)
250         {
251             int    ret;
252             size_t mark;
253 
254             while ((mark = Util.index(content, what, mark)) != content.length)
255                 if ((ret = dg(mark)) is 0)
256                     ++mark;
257                 else
258                     break;
259             return ret;
260         }
261     }
262 }
263 
264 
265 /******************************************************************************
266 
267   Convenient bundle of welterweight search utilities, without the
268   hassle of IFTI problems. Create one of these using the search()
269   function:
270   ---
271   auto match = search ("foo");
272   auto content = "wumpus foo bar"
273 
274   // search in the forward direction
275   auto index = match.forward (content);
276   assert (index is 7);
277 
278   // search again - returns length when no match found
279   assert (match.forward(content, index+1) is content.length);
280   ---
281 
282   Searching operates both forward and backward, with an optional
283   start offset (can be more convenient than slicing the content).
284   There are methods to replace matches within given content, and
285   others which return foreach() iterators for traversing content.
286 
287   FindFruct is a simpler variant, which can operate efficiently on
288   short matches and/or short content (employs brute-force strategy)
289 
290  ******************************************************************************/
291 
292 public struct SearchFruct
293 {
294     private cstring         what;
295     private bool            fore;
296     private ptrdiff_t[256]  offsets = void;
297 
298     /***********************************************************************
299 
300       Construct the fruct
301 
302      ***********************************************************************/
303 
304     static SearchFruct opCall (cstring what)
305     {
306         SearchFruct find = void;
307         find.match = what;
308         return find;
309     }
310 
311     /***********************************************************************
312 
313       Return the match text
314 
315      ***********************************************************************/
316 
317     cstring match ()
318     {
319         return what;
320     }
321 
322     /***********************************************************************
323 
324       Reset the text to match
325 
326      ***********************************************************************/
327 
328     void match (cstring what)
329     {
330         offsets[] = what.length + 1;
331         this.fore = true;
332         this.what = what;
333         reset;
334     }
335 
336     /***********************************************************************
337 
338       Search forward in the given content, starting at the
339       optional index.
340 
341       Returns the index of a match, or content.length where
342       no match was located.
343 
344      ***********************************************************************/
345 
346     size_t forward (cstring content, size_t ofs = 0)
347     {
348         if (! fore)
349             flip;
350 
351         if (ofs > content.length)
352             ofs = content.length;
353 
354         return find (cast(char*) what.ptr, what.length * char.sizeof,
355                 cast(char*) content.ptr, content.length * char.sizeof,
356                 ofs * char.sizeof) / char.sizeof;
357     }
358 
359     /***********************************************************************
360 
361       Search backward in the given content, starting at the
362       optional index.
363 
364       Returns the index of a match, or content.length where
365       no match was located.
366 
367      ***********************************************************************/
368 
369     size_t reverse (cstring content, size_t ofs = size_t.max)
370     {
371         if (fore)
372             flip;
373 
374         if (ofs > content.length)
375             ofs = content.length;
376 
377         return rfind (cast(char*) what.ptr, what.length * char.sizeof,
378                 cast(char*) content.ptr, content.length * char.sizeof,
379                 ofs * char.sizeof) / char.sizeof;
380     }
381 
382     /***********************************************************************
383 
384       Returns true if there is a match within the given content
385 
386      ***********************************************************************/
387 
388     bool within (cstring content)
389     {
390         return forward(content) != content.length;
391     }
392 
393     /***********************************************************************
394 
395       Returns number of matches within the given content
396 
397      ***********************************************************************/
398 
399     size_t count (cstring content)
400     {
401         size_t mark, count;
402 
403         while ((mark = forward (content, mark)) != content.length)
404             ++count, ++mark;
405         return count;
406     }
407 
408     /***********************************************************************
409 
410       Replace all matches with the given character. Use method
411       tokens() instead to avoid heap activity.
412 
413       Returns a copy of the content with replacements made
414 
415      ***********************************************************************/
416 
417     mstring replace (cstring content, char chr)
418     {
419         return replace (content, (&chr)[0..1]);
420     }
421 
422     /***********************************************************************
423 
424       Replace all matches with the given substitution. Use
425       method tokens() instead to avoid heap activity.
426 
427       Returns a copy of the content with replacements made
428 
429      ***********************************************************************/
430 
431     mstring replace (cstring content, cstring sub = null)
432     {
433         mstring output;
434 
435         foreach (s; tokens (content, sub))
436             output ~= s;
437         return output;
438     }
439 
440     /***********************************************************************
441 
442       Returns a foreach() iterator which exposes text segments
443       between all matches within the given content. Substitution
444       text is also injected in place of each match, and null can
445       be used to indicate removal instead:
446       ---
447       char[] result;
448 
449       auto match = search ("foo");
450       foreach (token; match.tokens("$foo&&foo*", "bar"))
451       result ~= token;
452       assert (result == "$bar&&bar*");
453       ---
454 
455       This mechanism avoids internal heap activity
456 
457      ***********************************************************************/
458 
459     Substitute tokens (cstring content, cstring sub = null) return
460     {
461         return Substitute (sub, what, content, &forward);
462     }
463 
464     /***********************************************************************
465 
466       Returns a foreach() iterator which exposes the indices of
467       all matches within the given content:
468       ---
469       int count;
470 
471       auto match = search ("foo");
472       foreach (index; match.indices("$foo&&foo*"))
473       ++count;
474       assert (count is 2);
475       ---
476 
477      ***********************************************************************/
478 
479     Indices indices (cstring content) return
480     {
481         return Indices (content, &forward);
482     }
483 
484     /***********************************************************************
485 
486      ***********************************************************************/
487 
488     private size_t find (const(char)* what, size_t wlen, const(char)* content,
489         size_t len, size_t ofs)
490     {
491         if (len == 0 && len < wlen)
492             return len;
493 
494         auto s = content;
495         content += ofs;
496         auto e = s + len - wlen;
497         while (content <= e)
498             if (*what is *content && matches(what, content, wlen))
499                 return content - s;
500             else
501                 content += offsets [content[wlen]];
502         return len;
503     }
504 
505 
506     /***********************************************************************
507 
508      ***********************************************************************/
509 
510     private size_t rfind (const(char)* what, size_t wlen,
511         const(char)* content, size_t len, size_t ofs)
512     {
513         if (len == 0 && len < wlen)
514             return len;
515 
516         auto s = content;
517         auto e = s + ofs - wlen;
518         while (e >= content)
519             if (*what is *e && matches(what, e, wlen))
520                 return e - s;
521             else
522                 e -= offsets [*(e-1)];
523         return len;
524     }
525 
526 
527     /***********************************************************************
528 
529      ***********************************************************************/
530 
531     private static bool matches (const(char)* a, const(char)* b, size_t length)
532     {
533         while (length > size_t.sizeof)
534             if (*cast(size_t*) a is *cast(size_t*) b)
535                 a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof;
536             else
537                 return false;
538 
539         while (length--)
540             if (*a++ != *b++)
541                 return false;
542         return true;
543     }
544 
545     /***********************************************************************
546 
547       Construct lookup table. We force the alphabet to be char[]
548       always, and consider wider characters to be longer patterns
549       instead
550 
551      ***********************************************************************/
552 
553     private void reset ()
554     {
555         if (fore)
556             for (ptrdiff_t i=0; i < this.what.length; ++i)
557                 offsets[this.what[i]] = this.what.length - i;
558         else
559             for (ptrdiff_t i= this.what.length; i--;)
560                 offsets[this.what[i]] = i+1;
561     }
562 
563     /***********************************************************************
564 
565       Reverse lookup-table direction
566 
567      ***********************************************************************/
568 
569     private void flip ()
570     {
571         fore ^= true;
572         reset;
573     }
574 
575     /***********************************************************************
576 
577       Simple foreach() iterator
578 
579      ***********************************************************************/
580 
581     private struct Indices
582     {
583         cstring content;
584         size_t delegate(cstring, size_t) call;
585 
586         int opApply (scope int delegate (ref size_t index) dg)
587         {
588             int     ret;
589             size_t  mark;
590 
591             while ((mark = call(content, mark)) != content.length)
592                 if ((ret = dg(mark)) is 0)
593                     ++mark;
594                 else
595                     break;
596             return ret;
597         }
598     }
599 
600     /***********************************************************************
601 
602       Substitution foreach() iterator
603 
604      ***********************************************************************/
605 
606     private struct Substitute
607     {
608         private cstring sub;
609         private cstring what;
610         private cstring content;
611 
612         size_t      delegate(cstring, size_t) call;
613 
614         int opApply (scope int delegate (ref cstring token) dg)
615         {
616             size_t  ret,
617                     pos,
618                     mark;
619             cstring token;
620 
621             while ((pos = call (content, mark)) < content.length)
622             {
623                 token = content [mark .. pos];
624                 if ((ret = dg(token)) != 0)
625                     return cast(int) ret;
626                 if (sub.ptr && (ret = dg(sub)) != 0)
627                     return cast(int) ret;
628                 mark = pos + what.length;
629             }
630 
631             token = content [mark .. $];
632             if (mark <= content.length)
633                 ret = dg (token);
634             return cast(int) ret;
635         }
636     }
637 }
638 
639 
640 unittest
641 {
642     auto searcher = search("aaa");
643 
644     // match
645     mstring content1 = "bbaaa".dup;
646     test (
647         searcher.find(
648             searcher.what.ptr, searcher.what.length,
649             content1.ptr, content1.length, 0
650             ) == 2
651         );
652 
653     // no match
654     mstring content2 = "bbbbb".dup;
655     test (
656         searcher.find(
657             searcher.what.ptr, searcher.what.length,
658             content2.ptr, content2.length, 0
659             ) == content2.length
660         );
661 
662     // empty text
663     test (searcher.find(searcher.what.ptr, searcher.what.length,
664                           null, 0, 0) == 0);
665 }
666 
667 unittest
668 {
669     auto searcher = search("aaa");
670 
671     // match
672     mstring content1 = "baaab".dup;
673     test (
674         searcher.rfind(
675             searcher.what.ptr, searcher.what.length,
676             content1.ptr, content1.length, content1.length
677             ) == 1
678         );
679 
680     // no match
681     mstring content2 = "bbbbb".dup;
682     test (
683         searcher.rfind(
684             searcher.what.ptr, searcher.what.length,
685             content2.ptr, content2.length, content2.length
686             ) == content2.length
687         );
688 
689     // empty text
690     test (searcher.rfind(searcher.what.ptr, searcher.what.length,
691                            null, 0, 0) == 0);
692 }