1 /******************************************************************************* 2 3 Copyright: 4 Copyright (c) 2009 Kris Bell. 5 Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH. 6 All rights reserved. 7 8 License: 9 Tango Dual License: 3-Clause BSD License / Academic Free License v3.0. 10 See LICENSE_TANGO.txt for details. 11 12 Version: May 2009: Initial release 13 14 Authors: Kris 15 16 *******************************************************************************/ 17 18 module ocean.text.Search; 19 20 import ocean.meta.types.Qualifiers : mstring, cstring; 21 22 import Util = ocean.text.Util; 23 24 version (unittest) import ocean.core.Test; 25 26 /****************************************************************************** 27 28 Returns a lightweight pattern matcher, good for short patterns 29 and/or short to medium length content. Brute-force approach with 30 fast multi-byte comparisons 31 32 ******************************************************************************/ 33 34 FindFruct find (cstring what) 35 { 36 return FindFruct(what); 37 } 38 39 /****************************************************************************** 40 41 Returns a welterweight pattern matcher, good for long patterns 42 and/or extensive content. Based on the QS algorithm which is a 43 Boyer-Moore variant. Does not allocate memory for the alphabet. 44 45 Generally becomes faster as the match-length grows 46 47 ******************************************************************************/ 48 49 SearchFruct search (cstring what) 50 { 51 return SearchFruct(what); 52 } 53 54 /****************************************************************************** 55 56 Convenient bundle of lightweight find utilities, without the 57 hassle of IFTI problems. Create one of these using the find() 58 function: 59 --- 60 auto match = find ("foo"); 61 auto content = "wumpus foo bar" 62 63 // search in the forward direction 64 auto index = match.forward (content); 65 assert (index is 7); 66 67 // search again - returns length when no match found 68 assert (match.forward(content, index+1) is content.length); 69 --- 70 71 Searching operates both forward and backward, with an optional 72 start offset (can be more convenient than slicing the content). 73 There are methods to replace matches within given content, and 74 others which return foreach() iterators for traversing content. 75 76 SearchFruct is a more sophisticated variant, which operates more 77 efficiently on longer matches and/or more extensive content. 78 79 ******************************************************************************/ 80 81 public struct FindFruct 82 { 83 private cstring what; 84 85 /*********************************************************************** 86 87 Search forward in the given content, starting at the 88 optional index. 89 90 Returns the index of a match, or content.length where 91 no match was located. 92 93 ***********************************************************************/ 94 95 size_t forward (cstring content, size_t ofs = 0) 96 { 97 return Util.index (content, what, ofs); 98 } 99 100 /*********************************************************************** 101 102 Search backward in the given content, starting at the 103 optional index. 104 105 Returns the index of a match, or content.length where 106 no match was located. 107 108 ***********************************************************************/ 109 110 size_t reverse (cstring content, size_t ofs = size_t.max) 111 { 112 return Util.rindex (content, what, ofs); 113 } 114 115 /*********************************************************************** 116 117 Return the match text 118 119 ***********************************************************************/ 120 121 cstring match () 122 { 123 return what; 124 } 125 126 /*********************************************************************** 127 128 Reset the text to match 129 130 ***********************************************************************/ 131 132 void match (cstring what) 133 { 134 this.what = what; 135 } 136 137 /*********************************************************************** 138 139 Returns true if there is a match within the given content 140 141 ***********************************************************************/ 142 143 bool within (cstring content) 144 { 145 return forward(content) != content.length; 146 } 147 148 /*********************************************************************** 149 150 Returns number of matches within the given content 151 152 ***********************************************************************/ 153 154 size_t count (cstring content) 155 { 156 size_t mark, count; 157 158 while ((mark = Util.index (content, what, mark)) != content.length) 159 ++count, ++mark; 160 return count; 161 } 162 163 /*********************************************************************** 164 165 Replace all matches with the given character. Use method 166 tokens() instead to avoid heap activity. 167 168 Returns a copy of the content with replacements made 169 170 ***********************************************************************/ 171 172 mstring replace (cstring content, char chr) 173 { 174 return replace (content, (&chr)[0..1]); 175 } 176 177 /*********************************************************************** 178 179 Replace all matches with the given substitution. Use 180 method tokens() instead to avoid heap activity. 181 182 Returns a copy of the content with replacements made 183 184 ***********************************************************************/ 185 186 mstring replace (cstring content, cstring sub = null) 187 { 188 mstring output; 189 190 foreach (s; tokens (content, sub)) 191 output ~= s; 192 return output; 193 } 194 195 /*********************************************************************** 196 197 Returns a foreach() iterator which exposes text segments 198 between all matches within the given content. Substitution 199 text is also injected in place of each match, and null can 200 be used to indicate removal instead: 201 --- 202 char[] result; 203 204 auto match = find ("foo"); 205 foreach (token; match.tokens ("$foo&&foo*", "bar")) 206 result ~= token; 207 assert (result == "$bar&&bar*"); 208 --- 209 210 This mechanism avoids internal heap activity. 211 212 ***********************************************************************/ 213 214 Util.PatternFruct!(const(char)) tokens (cstring content, cstring sub = null) 215 { 216 return Util.patterns (content, what, sub); 217 } 218 219 /*********************************************************************** 220 221 Returns a foreach() iterator which exposes the indices of 222 all matches within the given content: 223 --- 224 int count; 225 226 auto f = find ("foo"); 227 foreach (index; f.indices("$foo&&foo*")) 228 ++count; 229 assert (count is 2); 230 --- 231 232 ***********************************************************************/ 233 234 Indices indices (cstring content) 235 { 236 return Indices (what, content); 237 } 238 239 /*********************************************************************** 240 241 Simple foreach() iterator 242 243 ***********************************************************************/ 244 245 private struct Indices 246 { 247 cstring what, content; 248 249 int opApply (scope int delegate (ref size_t index) dg) 250 { 251 int ret; 252 size_t mark; 253 254 while ((mark = Util.index(content, what, mark)) != content.length) 255 if ((ret = dg(mark)) is 0) 256 ++mark; 257 else 258 break; 259 return ret; 260 } 261 } 262 } 263 264 265 /****************************************************************************** 266 267 Convenient bundle of welterweight search utilities, without the 268 hassle of IFTI problems. Create one of these using the search() 269 function: 270 --- 271 auto match = search ("foo"); 272 auto content = "wumpus foo bar" 273 274 // search in the forward direction 275 auto index = match.forward (content); 276 assert (index is 7); 277 278 // search again - returns length when no match found 279 assert (match.forward(content, index+1) is content.length); 280 --- 281 282 Searching operates both forward and backward, with an optional 283 start offset (can be more convenient than slicing the content). 284 There are methods to replace matches within given content, and 285 others which return foreach() iterators for traversing content. 286 287 FindFruct is a simpler variant, which can operate efficiently on 288 short matches and/or short content (employs brute-force strategy) 289 290 ******************************************************************************/ 291 292 public struct SearchFruct 293 { 294 private cstring what; 295 private bool fore; 296 private ptrdiff_t[256] offsets = void; 297 298 /*********************************************************************** 299 300 Construct the fruct 301 302 ***********************************************************************/ 303 304 static SearchFruct opCall (cstring what) 305 { 306 SearchFruct find = void; 307 find.match = what; 308 return find; 309 } 310 311 /*********************************************************************** 312 313 Return the match text 314 315 ***********************************************************************/ 316 317 cstring match () 318 { 319 return what; 320 } 321 322 /*********************************************************************** 323 324 Reset the text to match 325 326 ***********************************************************************/ 327 328 void match (cstring what) 329 { 330 offsets[] = what.length + 1; 331 this.fore = true; 332 this.what = what; 333 reset; 334 } 335 336 /*********************************************************************** 337 338 Search forward in the given content, starting at the 339 optional index. 340 341 Returns the index of a match, or content.length where 342 no match was located. 343 344 ***********************************************************************/ 345 346 size_t forward (cstring content, size_t ofs = 0) 347 { 348 if (! fore) 349 flip; 350 351 if (ofs > content.length) 352 ofs = content.length; 353 354 return find (cast(char*) what.ptr, what.length * char.sizeof, 355 cast(char*) content.ptr, content.length * char.sizeof, 356 ofs * char.sizeof) / char.sizeof; 357 } 358 359 /*********************************************************************** 360 361 Search backward in the given content, starting at the 362 optional index. 363 364 Returns the index of a match, or content.length where 365 no match was located. 366 367 ***********************************************************************/ 368 369 size_t reverse (cstring content, size_t ofs = size_t.max) 370 { 371 if (fore) 372 flip; 373 374 if (ofs > content.length) 375 ofs = content.length; 376 377 return rfind (cast(char*) what.ptr, what.length * char.sizeof, 378 cast(char*) content.ptr, content.length * char.sizeof, 379 ofs * char.sizeof) / char.sizeof; 380 } 381 382 /*********************************************************************** 383 384 Returns true if there is a match within the given content 385 386 ***********************************************************************/ 387 388 bool within (cstring content) 389 { 390 return forward(content) != content.length; 391 } 392 393 /*********************************************************************** 394 395 Returns number of matches within the given content 396 397 ***********************************************************************/ 398 399 size_t count (cstring content) 400 { 401 size_t mark, count; 402 403 while ((mark = forward (content, mark)) != content.length) 404 ++count, ++mark; 405 return count; 406 } 407 408 /*********************************************************************** 409 410 Replace all matches with the given character. Use method 411 tokens() instead to avoid heap activity. 412 413 Returns a copy of the content with replacements made 414 415 ***********************************************************************/ 416 417 mstring replace (cstring content, char chr) 418 { 419 return replace (content, (&chr)[0..1]); 420 } 421 422 /*********************************************************************** 423 424 Replace all matches with the given substitution. Use 425 method tokens() instead to avoid heap activity. 426 427 Returns a copy of the content with replacements made 428 429 ***********************************************************************/ 430 431 mstring replace (cstring content, cstring sub = null) 432 { 433 mstring output; 434 435 foreach (s; tokens (content, sub)) 436 output ~= s; 437 return output; 438 } 439 440 /*********************************************************************** 441 442 Returns a foreach() iterator which exposes text segments 443 between all matches within the given content. Substitution 444 text is also injected in place of each match, and null can 445 be used to indicate removal instead: 446 --- 447 char[] result; 448 449 auto match = search ("foo"); 450 foreach (token; match.tokens("$foo&&foo*", "bar")) 451 result ~= token; 452 assert (result == "$bar&&bar*"); 453 --- 454 455 This mechanism avoids internal heap activity 456 457 ***********************************************************************/ 458 459 Substitute tokens (cstring content, cstring sub = null) return 460 { 461 return Substitute (sub, what, content, &forward); 462 } 463 464 /*********************************************************************** 465 466 Returns a foreach() iterator which exposes the indices of 467 all matches within the given content: 468 --- 469 int count; 470 471 auto match = search ("foo"); 472 foreach (index; match.indices("$foo&&foo*")) 473 ++count; 474 assert (count is 2); 475 --- 476 477 ***********************************************************************/ 478 479 Indices indices (cstring content) return 480 { 481 return Indices (content, &forward); 482 } 483 484 /*********************************************************************** 485 486 ***********************************************************************/ 487 488 private size_t find (const(char)* what, size_t wlen, const(char)* content, 489 size_t len, size_t ofs) 490 { 491 if (len == 0 && len < wlen) 492 return len; 493 494 auto s = content; 495 content += ofs; 496 auto e = s + len - wlen; 497 while (content <= e) 498 if (*what is *content && matches(what, content, wlen)) 499 return content - s; 500 else 501 content += offsets [content[wlen]]; 502 return len; 503 } 504 505 506 /*********************************************************************** 507 508 ***********************************************************************/ 509 510 private size_t rfind (const(char)* what, size_t wlen, 511 const(char)* content, size_t len, size_t ofs) 512 { 513 if (len == 0 && len < wlen) 514 return len; 515 516 auto s = content; 517 auto e = s + ofs - wlen; 518 while (e >= content) 519 if (*what is *e && matches(what, e, wlen)) 520 return e - s; 521 else 522 e -= offsets [*(e-1)]; 523 return len; 524 } 525 526 527 /*********************************************************************** 528 529 ***********************************************************************/ 530 531 private static bool matches (const(char)* a, const(char)* b, size_t length) 532 { 533 while (length > size_t.sizeof) 534 if (*cast(size_t*) a is *cast(size_t*) b) 535 a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof; 536 else 537 return false; 538 539 while (length--) 540 if (*a++ != *b++) 541 return false; 542 return true; 543 } 544 545 /*********************************************************************** 546 547 Construct lookup table. We force the alphabet to be char[] 548 always, and consider wider characters to be longer patterns 549 instead 550 551 ***********************************************************************/ 552 553 private void reset () 554 { 555 if (fore) 556 for (ptrdiff_t i=0; i < this.what.length; ++i) 557 offsets[this.what[i]] = this.what.length - i; 558 else 559 for (ptrdiff_t i= this.what.length; i--;) 560 offsets[this.what[i]] = i+1; 561 } 562 563 /*********************************************************************** 564 565 Reverse lookup-table direction 566 567 ***********************************************************************/ 568 569 private void flip () 570 { 571 fore ^= true; 572 reset; 573 } 574 575 /*********************************************************************** 576 577 Simple foreach() iterator 578 579 ***********************************************************************/ 580 581 private struct Indices 582 { 583 cstring content; 584 size_t delegate(cstring, size_t) call; 585 586 int opApply (scope int delegate (ref size_t index) dg) 587 { 588 int ret; 589 size_t mark; 590 591 while ((mark = call(content, mark)) != content.length) 592 if ((ret = dg(mark)) is 0) 593 ++mark; 594 else 595 break; 596 return ret; 597 } 598 } 599 600 /*********************************************************************** 601 602 Substitution foreach() iterator 603 604 ***********************************************************************/ 605 606 private struct Substitute 607 { 608 private cstring sub; 609 private cstring what; 610 private cstring content; 611 612 size_t delegate(cstring, size_t) call; 613 614 int opApply (scope int delegate (ref cstring token) dg) 615 { 616 size_t ret, 617 pos, 618 mark; 619 cstring token; 620 621 while ((pos = call (content, mark)) < content.length) 622 { 623 token = content [mark .. pos]; 624 if ((ret = dg(token)) != 0) 625 return cast(int) ret; 626 if (sub.ptr && (ret = dg(sub)) != 0) 627 return cast(int) ret; 628 mark = pos + what.length; 629 } 630 631 token = content [mark .. $]; 632 if (mark <= content.length) 633 ret = dg (token); 634 return cast(int) ret; 635 } 636 } 637 } 638 639 640 unittest 641 { 642 auto searcher = search("aaa"); 643 644 // match 645 mstring content1 = "bbaaa".dup; 646 test ( 647 searcher.find( 648 searcher.what.ptr, searcher.what.length, 649 content1.ptr, content1.length, 0 650 ) == 2 651 ); 652 653 // no match 654 mstring content2 = "bbbbb".dup; 655 test ( 656 searcher.find( 657 searcher.what.ptr, searcher.what.length, 658 content2.ptr, content2.length, 0 659 ) == content2.length 660 ); 661 662 // empty text 663 test (searcher.find(searcher.what.ptr, searcher.what.length, 664 null, 0, 0) == 0); 665 } 666 667 unittest 668 { 669 auto searcher = search("aaa"); 670 671 // match 672 mstring content1 = "baaab".dup; 673 test ( 674 searcher.rfind( 675 searcher.what.ptr, searcher.what.length, 676 content1.ptr, content1.length, content1.length 677 ) == 1 678 ); 679 680 // no match 681 mstring content2 = "bbbbb".dup; 682 test ( 683 searcher.rfind( 684 searcher.what.ptr, searcher.what.length, 685 content2.ptr, content2.length, content2.length 686 ) == content2.length 687 ); 688 689 // empty text 690 test (searcher.rfind(searcher.what.ptr, searcher.what.length, 691 null, 0, 0) == 0); 692 }