1 /*******************************************************************************
2
3 Copyright:
4 Copyright (c) 2009 Kris Bell.
5 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6 All rights reserved.
7
8 License:
9 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10 See LICENSE_TANGO.txt for details.
11
12 Version: May 2009: Initial release
13
14 Authors: Kris
15
16 *******************************************************************************/
17
18 module ocean.text.Search;
19
20 import ocean.meta.types.Qualifiers : mstring, cstring;
21
22 import Util = ocean.text.Util;
23
24 version(UnitTest) import ocean.core.Test;
25
26 /******************************************************************************
27
28 Returns a lightweight pattern matcher, good for short patterns
29 and/or short to medium length content. Brute-force approach with
30 fast multi-byte comparisons
31
32 ******************************************************************************/
33
34 FindFruct find (cstring what)
35 {
36 return FindFruct(what);
37 }
38
39 /******************************************************************************
40
41 Returns a welterweight pattern matcher, good for long patterns
42 and/or extensive content. Based on the QS algorithm which is a
43 Boyer-Moore variant. Does not allocate memory for the alphabet.
44
45 Generally becomes faster as the match-length grows
46
47 ******************************************************************************/
48
49 SearchFruct search (cstring what)
50 {
51 return SearchFruct(what);
52 }
53
54 /******************************************************************************
55
56 Convenient bundle of lightweight find utilities, without the
57 hassle of IFTI problems. Create one of these using the find()
58 function:
59 ---
60 auto match = find ("foo");
61 auto content = "wumpus foo bar"
62
63 // search in the forward direction
64 auto index = match.forward (content);
65 assert (index is 7);
66
67 // search again - returns length when no match found
68 assert (match.forward(content, index+1) is content.length);
69 ---
70
71 Searching operates both forward and backward, with an optional
72 start offset (can be more convenient than slicing the content).
73 There are methods to replace matches within given content, and
74 others which return foreach() iterators for traversing content.
75
76 SearchFruct is a more sophisticated variant, which operates more
77 efficiently on longer matches and/or more extensive content.
78
79 ******************************************************************************/
80
81 public struct FindFruct
82 {
83 private cstring what;
84
85 /***********************************************************************
86
87 Search forward in the given content, starting at the
88 optional index.
89
90 Returns the index of a match, or content.length where
91 no match was located.
92
93 ***********************************************************************/
94
95 size_t forward (cstring content, size_t ofs = 0)
96 {
97 return Util.index (content, what, ofs);
98 }
99
100 /***********************************************************************
101
102 Search backward in the given content, starting at the
103 optional index.
104
105 Returns the index of a match, or content.length where
106 no match was located.
107
108 ***********************************************************************/
109
110 size_t reverse (cstring content, size_t ofs = size_t.max)
111 {
112 return Util.rindex (content, what, ofs);
113 }
114
115 /***********************************************************************
116
117 Return the match text
118
119 ***********************************************************************/
120
121 cstring match ()
122 {
123 return what;
124 }
125
126 /***********************************************************************
127
128 Reset the text to match
129
130 ***********************************************************************/
131
132 void match (cstring what)
133 {
134 (&this).what = what;
135 }
136
137 /***********************************************************************
138
139 Returns true if there is a match within the given content
140
141 ***********************************************************************/
142
143 bool within (cstring content)
144 {
145 return forward(content) != content.length;
146 }
147
148 /***********************************************************************
149
150 Returns number of matches within the given content
151
152 ***********************************************************************/
153
154 size_t count (cstring content)
155 {
156 size_t mark, count;
157
158 while ((mark = Util.index (content, what, mark)) != content.length)
159 ++count, ++mark;
160 return count;
161 }
162
163 /***********************************************************************
164
165 Replace all matches with the given character. Use method
166 tokens() instead to avoid heap activity.
167
168 Returns a copy of the content with replacements made
169
170 ***********************************************************************/
171
172 mstring replace (cstring content, char chr)
173 {
174 return replace (content, (&chr)[0..1]);
175 }
176
177 /***********************************************************************
178
179 Replace all matches with the given substitution. Use
180 method tokens() instead to avoid heap activity.
181
182 Returns a copy of the content with replacements made
183
184 ***********************************************************************/
185
186 mstring replace (cstring content, cstring sub = null)
187 {
188 mstring output;
189
190 foreach (s; tokens (content, sub))
191 output ~= s;
192 return output;
193 }
194
195 /***********************************************************************
196
197 Returns a foreach() iterator which exposes text segments
198 between all matches within the given content. Substitution
199 text is also injected in place of each match, and null can
200 be used to indicate removal instead:
201 ---
202 char[] result;
203
204 auto match = find ("foo");
205 foreach (token; match.tokens ("$foo&&foo*", "bar"))
206 result ~= token;
207 assert (result == "$bar&&bar*");
208 ---
209
210 This mechanism avoids internal heap activity.
211
212 ***********************************************************************/
213
214 Util.PatternFruct!(const(char)) tokens (cstring content, cstring sub = null)
215 {
216 return Util.patterns (content, what, sub);
217 }
218
219 /***********************************************************************
220
221 Returns a foreach() iterator which exposes the indices of
222 all matches within the given content:
223 ---
224 int count;
225
226 auto f = find ("foo");
227 foreach (index; f.indices("$foo&&foo*"))
228 ++count;
229 assert (count is 2);
230 ---
231
232 ***********************************************************************/
233
234 Indices indices (cstring content)
235 {
236 return Indices (what, content);
237 }
238
239 /***********************************************************************
240
241 Simple foreach() iterator
242
243 ***********************************************************************/
244
245 private struct Indices
246 {
247 cstring what, content;
248
249 int opApply (scope int delegate (ref size_t index) dg)
250 {
251 int ret;
252 size_t mark;
253
254 while ((mark = Util.index(content, what, mark)) != content.length)
255 if ((ret = dg(mark)) is 0)
256 ++mark;
257 else
258 break;
259 return ret;
260 }
261 }
262 }
263
264
265 /******************************************************************************
266
267 Convenient bundle of welterweight search utilities, without the
268 hassle of IFTI problems. Create one of these using the search()
269 function:
270 ---
271 auto match = search ("foo");
272 auto content = "wumpus foo bar"
273
274 // search in the forward direction
275 auto index = match.forward (content);
276 assert (index is 7);
277
278 // search again - returns length when no match found
279 assert (match.forward(content, index+1) is content.length);
280 ---
281
282 Searching operates both forward and backward, with an optional
283 start offset (can be more convenient than slicing the content).
284 There are methods to replace matches within given content, and
285 others which return foreach() iterators for traversing content.
286
287 FindFruct is a simpler variant, which can operate efficiently on
288 short matches and/or short content (employs brute-force strategy)
289
290 ******************************************************************************/
291
292 public struct SearchFruct
293 {
294 private cstring what;
295 private bool fore;
296 private ptrdiff_t[256] offsets = void;
297
298 /***********************************************************************
299
300 Construct the fruct
301
302 ***********************************************************************/
303
304 static SearchFruct opCall (cstring what)
305 {
306 SearchFruct find = void;
307 find.match = what;
308 return find;
309 }
310
311 /***********************************************************************
312
313 Return the match text
314
315 ***********************************************************************/
316
317 cstring match ()
318 {
319 return what;
320 }
321
322 /***********************************************************************
323
324 Reset the text to match
325
326 ***********************************************************************/
327
328 void match (cstring what)
329 {
330 offsets[] = what.length + 1;
331 (&this).fore = true;
332 (&this).what = what;
333 reset;
334 }
335
336 /***********************************************************************
337
338 Search forward in the given content, starting at the
339 optional index.
340
341 Returns the index of a match, or content.length where
342 no match was located.
343
344 ***********************************************************************/
345
346 size_t forward (cstring content, size_t ofs = 0)
347 {
348 if (! fore)
349 flip;
350
351 if (ofs > content.length)
352 ofs = content.length;
353
354 return find (cast(char*) what.ptr, what.length * char.sizeof,
355 cast(char*) content.ptr, content.length * char.sizeof,
356 ofs * char.sizeof) / char.sizeof;
357 }
358
359 /***********************************************************************
360
361 Search backward in the given content, starting at the
362 optional index.
363
364 Returns the index of a match, or content.length where
365 no match was located.
366
367 ***********************************************************************/
368
369 size_t reverse (cstring content, size_t ofs = size_t.max)
370 {
371 if (fore)
372 flip;
373
374 if (ofs > content.length)
375 ofs = content.length;
376
377 return rfind (cast(char*) what.ptr, what.length * char.sizeof,
378 cast(char*) content.ptr, content.length * char.sizeof,
379 ofs * char.sizeof) / char.sizeof;
380 }
381
382 /***********************************************************************
383
384 Returns true if there is a match within the given content
385
386 ***********************************************************************/
387
388 bool within (cstring content)
389 {
390 return forward(content) != content.length;
391 }
392
393 /***********************************************************************
394
395 Returns number of matches within the given content
396
397 ***********************************************************************/
398
399 size_t count (cstring content)
400 {
401 size_t mark, count;
402
403 while ((mark = forward (content, mark)) != content.length)
404 ++count, ++mark;
405 return count;
406 }
407
408 /***********************************************************************
409
410 Replace all matches with the given character. Use method
411 tokens() instead to avoid heap activity.
412
413 Returns a copy of the content with replacements made
414
415 ***********************************************************************/
416
417 mstring replace (cstring content, char chr)
418 {
419 return replace (content, (&chr)[0..1]);
420 }
421
422 /***********************************************************************
423
424 Replace all matches with the given substitution. Use
425 method tokens() instead to avoid heap activity.
426
427 Returns a copy of the content with replacements made
428
429 ***********************************************************************/
430
431 mstring replace (cstring content, cstring sub = null)
432 {
433 mstring output;
434
435 foreach (s; tokens (content, sub))
436 output ~= s;
437 return output;
438 }
439
440 /***********************************************************************
441
442 Returns a foreach() iterator which exposes text segments
443 between all matches within the given content. Substitution
444 text is also injected in place of each match, and null can
445 be used to indicate removal instead:
446 ---
447 char[] result;
448
449 auto match = search ("foo");
450 foreach (token; match.tokens("$foo&&foo*", "bar"))
451 result ~= token;
452 assert (result == "$bar&&bar*");
453 ---
454
455 This mechanism avoids internal heap activity
456
457 ***********************************************************************/
458
459 Substitute tokens (cstring content, cstring sub = null)
460 {
461 return Substitute (sub, what, content, &forward);
462 }
463
464 /***********************************************************************
465
466 Returns a foreach() iterator which exposes the indices of
467 all matches within the given content:
468 ---
469 int count;
470
471 auto match = search ("foo");
472 foreach (index; match.indices("$foo&&foo*"))
473 ++count;
474 assert (count is 2);
475 ---
476
477 ***********************************************************************/
478
479 Indices indices (cstring content)
480 {
481 return Indices (content, &forward);
482 }
483
484 /***********************************************************************
485
486 ***********************************************************************/
487
488 private size_t find (const(char)* what, size_t wlen, const(char)* content,
489 size_t len, size_t ofs)
490 {
491 if (len == 0 && len < wlen)
492 return len;
493
494 auto s = content;
495 content += ofs;
496 auto e = s + len - wlen;
497 while (content <= e)
498 if (*what is *content && matches(what, content, wlen))
499 return content - s;
500 else
501 content += offsets [content[wlen]];
502 return len;
503 }
504
505
506 /***********************************************************************
507
508 ***********************************************************************/
509
510 private size_t rfind (const(char)* what, size_t wlen,
511 const(char)* content, size_t len, size_t ofs)
512 {
513 if (len == 0 && len < wlen)
514 return len;
515
516 auto s = content;
517 auto e = s + ofs - wlen;
518 while (e >= content)
519 if (*what is *e && matches(what, e, wlen))
520 return e - s;
521 else
522 e -= offsets [*(e-1)];
523 return len;
524 }
525
526
527 /***********************************************************************
528
529 ***********************************************************************/
530
531 private static bool matches (const(char)* a, const(char)* b, size_t length)
532 {
533 while (length > size_t.sizeof)
534 if (*cast(size_t*) a is *cast(size_t*) b)
535 a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof;
536 else
537 return false;
538
539 while (length--)
540 if (*a++ != *b++)
541 return false;
542 return true;
543 }
544
545 /***********************************************************************
546
547 Construct lookup table. We force the alphabet to be char[]
548 always, and consider wider characters to be longer patterns
549 instead
550
551 ***********************************************************************/
552
553 private void reset ()
554 {
555 if (fore)
556 for (ptrdiff_t i=0; i < this.what.length; ++i)
557 offsets[this.what[i]] = this.what.length - i;
558 else
559 for (ptrdiff_t i= this.what.length; i--;)
560 offsets[this.what[i]] = i+1;
561 }
562
563 /***********************************************************************
564
565 Reverse lookup-table direction
566
567 ***********************************************************************/
568
569 private void flip ()
570 {
571 fore ^= true;
572 reset;
573 }
574
575 /***********************************************************************
576
577 Simple foreach() iterator
578
579 ***********************************************************************/
580
581 private struct Indices
582 {
583 cstring content;
584 size_t delegate(cstring, size_t) call;
585
586 int opApply (scope int delegate (ref size_t index) dg)
587 {
588 int ret;
589 size_t mark;
590
591 while ((mark = call(content, mark)) != content.length)
592 if ((ret = dg(mark)) is 0)
593 ++mark;
594 else
595 break;
596 return ret;
597 }
598 }
599
600 /***********************************************************************
601
602 Substitution foreach() iterator
603
604 ***********************************************************************/
605
606 private struct Substitute
607 {
608 private cstring sub;
609 private cstring what;
610 private cstring content;
611
612 size_t delegate(cstring, size_t) call;
613
614 int opApply (scope int delegate (ref cstring token) dg)
615 {
616 size_t ret,
617 pos,
618 mark;
619 cstring token;
620
621 while ((pos = call (content, mark)) < content.length)
622 {
623 token = content [mark .. pos];
624 if ((ret = dg(token)) != 0)
625 return cast(int) ret;
626 if (sub.ptr && (ret = dg(sub)) != 0)
627 return cast(int) ret;
628 mark = pos + what.length;
629 }
630
631 token = content [mark .. $];
632 if (mark <= content.length)
633 ret = dg (token);
634 return cast(int) ret;
635 }
636 }
637 }
638
639
640 unittest
641 {
642 auto searcher = search("aaa");
643
644 // match
645 mstring content1 = "bbaaa".dup;
646 test (
647 searcher.find(
648 searcher.what.ptr, searcher.what.length,
649 content1.ptr, content1.length, 0
650 ) == 2
651 );
652
653 // no match
654 mstring content2 = "bbbbb".dup;
655 test (
656 searcher.find(
657 searcher.what.ptr, searcher.what.length,
658 content2.ptr, content2.length, 0
659 ) == content2.length
660 );
661
662 // empty text
663 test (searcher.find(searcher.what.ptr, searcher.what.length,
664 null, 0, 0) == 0);
665 }
666
667 unittest
668 {
669 auto searcher = search("aaa");
670
671 // match
672 mstring content1 = "baaab".dup;
673 test (
674 searcher.rfind(
675 searcher.what.ptr, searcher.what.length,
676 content1.ptr, content1.length, content1.length
677 ) == 1
678 );
679
680 // no match
681 mstring content2 = "bbbbb".dup;
682 test (
683 searcher.rfind(
684 searcher.what.ptr, searcher.what.length,
685 content2.ptr, content2.length, content2.length
686 ) == content2.length
687 );
688
689 // empty text
690 test (searcher.rfind(searcher.what.ptr, searcher.what.length,
691 null, 0, 0) == 0);
692 }