1 /******************************************************************************* 2 3 D Library Wrapper for PCRE regular expression engine. 4 5 Requires linking with libpcre (-lpcre). 6 7 Usage example: 8 9 --- 10 11 import ocean.text.regex.PCRE; 12 13 auto pcre = new PCRE; 14 15 // Simple, one-off use 16 auto match = pcre.preg_match("Hello World!", "^Hello"); 17 18 // Compile then reuse 19 auto regex = pcre.new CompiledRegex; 20 regex.compile("^Hello"); 21 for ( int i; i < 100; i++ ) 22 { 23 auto match = regex.match("Hello World!"); 24 } 25 26 --- 27 28 29 Related: 30 http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html 31 32 Copyright: 33 Copyright (c) 2009-2016 dunnhumby Germany GmbH. 34 All rights reserved. 35 36 License: 37 Boost Software License Version 1.0. See LICENSE_BOOST.txt for details. 38 Alternatively, this file may be distributed under the terms of the Tango 39 3-Clause BSD License (see LICENSE_BSD.txt for details). 40 41 Bear in mind this module provides bindings to an external library that 42 has its own license, which might be more restrictive. Please check the 43 external library license to see which conditions apply for linking. 44 45 *******************************************************************************/ 46 47 module ocean.text.regex.PCRE; 48 49 50 51 52 import ocean.transition; 53 54 import ocean.core.Array : copy, concat; 55 import ocean.text.util.StringC; 56 import ocean.text.regex.c.pcre; 57 import ocean.core.TypeConvert; 58 59 import core.stdc.stdlib : free; 60 import ocean.text.convert.Formatter; 61 62 import ocean.core.Verify; 63 64 65 /******************************************************************************* 66 67 PCRE 68 69 *******************************************************************************/ 70 71 public class PCRE 72 { 73 /*************************************************************************** 74 75 Represents a PCRE Exception. 76 The class is re-usable exception where the error message can be 77 reset and the same instance can be re-thrown. 78 79 ***************************************************************************/ 80 81 public static class PcreException : Exception 82 { 83 /*********************************************************************** 84 85 Error code returned by pcre function 86 87 ***********************************************************************/ 88 89 public int error; 90 91 /*********************************************************************** 92 93 Constructor. 94 Just calls the super Exception constructor with initial error 95 message. 96 97 ***********************************************************************/ 98 99 public this() 100 { 101 super("Error message not yet set"); 102 } 103 104 /*********************************************************************** 105 106 Sets the error code and message. 107 108 Params: 109 code = error code to set 110 msg = the new exception message to be used 111 112 ***********************************************************************/ 113 114 private void set ( int code, istring msg ) 115 { 116 this.error = code; 117 this.msg = msg; 118 } 119 } 120 121 /*************************************************************************** 122 123 Limits the complexity of regex searches. If a regex search passes the 124 specified complexity limit without either finding a match or determining 125 that no match exists, it bails out, throwing an exception (see 126 CompiledRegex.match()). 127 128 The default value of 0 uses libpcre's built-in default complexity 129 limit (10 million, see below), which is set to be extremely permissive. 130 Any value less than 10 million will have the effect of reducing the 131 level of complexity tolerated, thus reducing the potential processing 132 time spent searching. 133 134 This field maps directly to the match_limit field in libpcre's 135 pcre_extra struct. 136 137 From http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html: 138 139 The match_limit field provides a means of preventing PCRE from using up 140 a vast amount of resources when running patterns that are not going to 141 match, but which have a very large number of possibilities in their 142 search trees. The classic example is the use of nested unlimited 143 repeats. 144 145 Internally, PCRE uses a function called match() which it calls 146 repeatedly (sometimes recursively). The limit set by match_limit is 147 imposed on the number of times this function is called during a match, 148 which has the effect of limiting the amount of backtracking that can 149 take place. For patterns that are not anchored, the count restarts from 150 zero for each position in the subject string. 151 152 The default value for the limit can be set when PCRE is built; the 153 default default is 10 million, which handles all but the most extreme 154 cases. 155 156 ***************************************************************************/ 157 158 public static immutable int DEFAULT_COMPLEXITY_LIMIT = 0; 159 160 public int complexity_limit = DEFAULT_COMPLEXITY_LIMIT; 161 162 /*************************************************************************** 163 164 String used internally for formatting. 165 166 ***************************************************************************/ 167 168 private mstring buffer_char; 169 170 /*************************************************************************** 171 172 A re-usable exception instance 173 174 ***************************************************************************/ 175 176 private PcreException exception; 177 178 /*************************************************************************** 179 180 Compiled regex class. Enables a regex pattern to be compiled once and 181 used for multiple searches. 182 183 ***************************************************************************/ 184 185 public class CompiledRegex 186 { 187 /*********************************************************************** 188 189 Pointer to C-allocated pcre regex object, created upon compilation 190 of a regex (see compile()). 191 192 ***********************************************************************/ 193 194 private pcre* pcre_object; 195 196 /*********************************************************************** 197 198 Settings used by the call to pcre_exec() in the match() method. 199 These are modified by the complexity_limit field of the outer class, 200 and by the study() method. 201 202 ***********************************************************************/ 203 204 private pcre_extra match_settings; 205 206 /*********************************************************************** 207 208 Destructor. Frees the C-allocated pcre object. 209 210 ***********************************************************************/ 211 212 ~this ( ) 213 { 214 this.cleanup(); 215 } 216 217 /*********************************************************************** 218 219 Compiles the specified regex for use in the match() method. Cleans 220 up a previously compiled regex, if this instance has been used 221 before. 222 223 Params: 224 pattern = pattern to search for, as a string 225 case_sens = case sensitive matching 226 227 Throws: 228 if the compilation of the regex fails 229 230 Out: 231 following a call to this method, the compiled regex exists 232 233 ***********************************************************************/ 234 235 public void compile ( cstring pattern, bool case_sens = true ) 236 out 237 { 238 assert(this.pcre_object !is null); 239 } 240 body 241 { 242 243 this.cleanup(); 244 245 char* errmsg; 246 int error_code; 247 int error_offset; 248 249 this.outer.buffer_char.concat(pattern, "\0"[]); 250 this.pcre_object = pcre_compile2(this.outer.buffer_char.ptr, 251 (case_sens ? 0 : PCRE_CASELESS), &error_code, &errmsg, 252 &error_offset, null); 253 if ( !this.pcre_object ) 254 { 255 this.outer.exception.msg = format( 256 "Error compiling regular expression: {} - on pattern: {} at position {}", 257 StringC.toDString(errmsg), pattern, error_offset 258 ); 259 this.outer.exception.error = error_code; 260 throw this.outer.exception; 261 } 262 } 263 264 /*********************************************************************** 265 266 Perform a regular expression match. 267 268 Params: 269 subject = the compiled patter will be matched against this string 270 271 Returns: 272 true, if matches or false if no match 273 274 Throws: 275 if an error occurs when running the regex search 276 277 In: 278 the regex must have been compiled 279 280 ***********************************************************************/ 281 282 public bool match ( cstring subject ) 283 { 284 //"" matches against the pattern "" 285 if ( this.outer.buffer_char == "\0" && subject == "" ) 286 { 287 return true; 288 } 289 290 return this.findFirst(subject) != null; 291 } 292 293 /*********************************************************************** 294 295 Performs a regular expression match and return the first found match 296 297 Params: 298 subject = input string 299 300 Returns: 301 slice to the first found match or null if no match 302 303 Throws: 304 if an error occurs when running the regex search 305 306 In: 307 the regex must have been compiled 308 309 ***********************************************************************/ 310 311 public cstring findFirst ( cstring subject ) 312 { 313 verify(this.pcre_object !is null); 314 315 // This method supports only one capture so size 3 is enough. 316 // 2/3 for the positions and 1/3 for a internal buffer 317 int[3] ovector; 318 319 if ( this.outer.complexity_limit != DEFAULT_COMPLEXITY_LIMIT ) 320 { 321 this.match_settings.flags |= PCRE_EXTRA_MATCH_LIMIT; 322 this.match_settings.match_limit = this.outer.complexity_limit; 323 } 324 325 int error_code = pcre_exec(this.pcre_object, &this.match_settings, 326 subject.ptr, castFrom!(size_t).to!(int)(subject.length), 0, 0, 327 ovector.ptr, ovector.length); 328 329 // A positive return value indicates that a single match was found 330 // and its indices stored in ovector[0] and ovector[1]. 331 // A zero return value indicates that multiple matches were found, 332 // the first stored in ovector[0] and ovector[1], and the rest 333 // discarded. 334 if ( error_code >= 0 ) 335 { 336 // we got a match but ovector is 0 so the whole subject matched 337 if ( ovector[0] == 0 && ovector[1] == 0 ) 338 { 339 return subject; 340 } 341 342 return subject[ovector[0] .. ovector[1]]; 343 } 344 // Negative return values indicate failure or error. We ignore 345 // match failures and throw on error. 346 else if ( error_code != PCRE_ERROR_NOMATCH ) 347 { 348 this.outer.exception.set(error_code, 349 "Error on executing regular expression!"); 350 throw this.outer.exception; 351 } 352 353 return null; 354 } 355 356 /*********************************************************************** 357 358 Performs a regular expression match and returns an array of slices 359 of all matches 360 361 Params: 362 subject = input string 363 matches_buffer = found matches will be stored here 364 365 Returns: 366 Array with slices of all matches 367 368 Throws: 369 if an error occurs when running the regex search 370 371 In: 372 the regex must have been compiled 373 374 ***********************************************************************/ 375 376 public cstring[] findAll ( cstring subject, ref cstring[] matches_buffer ) 377 { 378 verify(this.pcre_object !is null); 379 380 ptrdiff_t pos; 381 382 while ( pos < subject.length ) 383 { 384 if ( auto match = this.findFirst(subject[pos .. $]) ) 385 { 386 matches_buffer ~= match; 387 //set pos to end of the match 388 pos = match.ptr - subject.ptr + match.length; 389 } 390 else 391 { 392 break; 393 } 394 } 395 396 return matches_buffer; 397 } 398 399 /*********************************************************************** 400 401 Study a compiled regex in order to increase processing efficiency 402 when calling match(). This is usually only worth doing for a regex 403 which will be used many times, and does not always yield an 404 improvement in efficiency. 405 406 Throws: 407 if an error occurs when studying the regex 408 409 In: 410 the regex must have been compiled 411 412 ***********************************************************************/ 413 414 public void study ( ) 415 { 416 verify(this.pcre_object !is null); 417 418 char* errmsg; 419 auto res = pcre_study(this.pcre_object, 0, &errmsg); 420 if ( errmsg ) 421 { 422 auto derrmsg = StringC.toDString(errmsg); 423 this.outer.exception.set(0, assumeUnique(derrmsg)); 424 throw this.outer.exception; 425 } 426 if ( res ) 427 { 428 this.match_settings.study_data = res.study_data; 429 } 430 } 431 432 /*********************************************************************** 433 434 Cleans up the compiled regex object and the study data. 435 436 ***********************************************************************/ 437 438 private void cleanup ( ) 439 { 440 free(this.pcre_object); 441 this.match_settings = this.match_settings.init; 442 } 443 } 444 445 /*************************************************************************** 446 447 Constructor. Initializes the re-usable exception. 448 449 ***************************************************************************/ 450 451 public this ( ) 452 { 453 this.exception = new PcreException(); 454 } 455 456 /*************************************************************************** 457 458 Perform a regular expression match. Note that this method internally 459 allocates and then frees a C pcre object each time it is called. If you 460 want to run the same regex search multiple times on different input, you 461 are probably better off using the compile() method, above. 462 463 Usage: 464 auto regex = new PCRE; 465 bool match = regex.preg_match("Hello World!", "^Hello"); 466 467 Params: 468 subject = the compiled patter will be matched against this string 469 pattern = pattern to search for, as a string 470 case_sens = case sensitive matching 471 472 Returns: 473 true, if matches or false if no match 474 475 Throws: 476 if the compilation or running of the regex fails 477 478 ***************************************************************************/ 479 480 public bool preg_match ( cstring subject, cstring pattern, bool case_sens = true ) 481 { 482 scope regex = new CompiledRegex; 483 regex.compile(pattern, case_sens); 484 return regex.match(subject); 485 } 486 } 487 488 version ( UnitTest ) 489 { 490 import ocean.core.Test; 491 492 class CounterNamedTest : NamedTest 493 { 494 static uint test_num; 495 496 this ( ) 497 { 498 super(format("PCRE test #{}", ++test_num)); 499 } 500 } 501 } 502 503 504 /******************************************************************************* 505 506 Test for invalid pattern 507 508 *******************************************************************************/ 509 510 unittest 511 { 512 auto pcre = new PCRE; 513 testThrown!(PCRE.PcreException)(pcre.preg_match("", "(")); 514 } 515 516 /******************************************************************************* 517 518 Tests for simple boolean matching via the preg_match() method. (This 519 unittest tests only the interface of this method. It does not test the full 520 range of PCRE features as that is beyond its scope.) 521 522 *******************************************************************************/ 523 524 unittest 525 { 526 void test ( scope bool delegate ( ) dg, bool match ) 527 { 528 auto t = new CounterNamedTest; 529 t.test!("==")(match, dg()); 530 } 531 532 auto pcre = new PCRE; 533 534 // Empty pattern (matches any string) 535 test({ return pcre.preg_match("Hello World", ""); }, true); 536 537 // Empty string and empty pattern (match) 538 test({ return pcre.preg_match("", ""); }, true); 539 540 // Empty string (no match) 541 test({ return pcre.preg_match("", "a"); }, false); 542 543 // Simple string match 544 test({ return pcre.preg_match("Hello World", "Hello"); }, true); 545 546 // Simple string match (fail) 547 test({ return pcre.preg_match("Hello World", "Hallo"); }, false); 548 549 // Case-sensitive match (fail) 550 test({ return pcre.preg_match("Hello World", "hello"); }, false); 551 552 // Case-insensitive match 553 test({ return pcre.preg_match("Hello World", "hello", false); }, true); 554 } 555 556 /******************************************************************************* 557 558 Tests for single substring matching via the CompiledRegex.findFirst() 559 method. 560 561 *******************************************************************************/ 562 563 unittest 564 { 565 auto pcre = new PCRE; 566 auto regex = pcre..new CompiledRegex; 567 568 void testFind ( cstring needle, cstring pre, cstring match, cstring post ) 569 { 570 regex.compile(needle); 571 572 auto haystack = pre ~ match ~ post; 573 auto res = regex.findFirst(haystack); 574 575 auto t = new CounterNamedTest; 576 t.test!("==")(res, haystack[pre.length .. pre.length + match.length]); 577 } 578 579 // Simple match 580 testFind( 581 "a", 582 "bbb", 583 "a", 584 "ccc" 585 ); 586 587 // Match with a more complicated regex 588 testFind( 589 "(firstparam=[^l]*liza.*(secondparam=mary|secondparam=lena|secondparam=john))|((secondparam=mary|secondparam=lena|secondparam=john).*firstparam=liza)", 590 "http://example.org?", 591 "firstparam=elizabeth&rand=84527497861&secondparam=mary", 592 "&some=other¶ms=are&here=%20%21%22%12222" 593 ); 594 595 // Single match returned when multiple matches are possible 596 testFind( 597 "a", 598 "bbb", 599 "a", 600 "cacac" 601 ); 602 } 603 604 /******************************************************************************* 605 606 Tests for multiple substring matching via the CompiledRegex.findAll() 607 method. 608 609 *******************************************************************************/ 610 611 unittest 612 { 613 auto pcre = new PCRE; 614 auto regex = pcre..new CompiledRegex; 615 cstring[] matches_buffer; 616 617 auto t = new NamedTest("PCRE findAll"); 618 619 { 620 istring str = "apa bepa cepa depa epa fepa gepa hepa"; 621 regex.compile("a[ ]", false); 622 623 matches_buffer.length = 0; 624 enableStomping(matches_buffer); 625 626 foreach ( match; regex.findAll(str, matches_buffer) ) 627 { 628 t.test!("==")(match, "a "[]); 629 } 630 631 t.test!("==")(matches_buffer.length, 7); 632 } 633 634 { 635 istring[3] exp = ["ast", "at", "ast"]; 636 istring str = "en hast at en annan hast"; 637 regex.compile("a[s]*t", false); 638 639 matches_buffer = null; 640 641 642 foreach (i, match; regex.findAll(str, matches_buffer) ) 643 { 644 t.test!("==")(match, exp[i]); 645 } 646 t.test!("==")(matches_buffer.length, 3); 647 } 648 649 { 650 istring[3] exp = ["ta", "tb", "td"]; 651 istring str = "tatb t c Tf td"; 652 regex.compile("t[\\w]", true); 653 regex.study(); 654 655 matches_buffer = null; 656 657 foreach (i, match; regex.findAll(str, matches_buffer) ) 658 { 659 t.test!("==")(match, exp[i]); 660 } 661 662 t.test!("==")(matches_buffer.length, 3); 663 } 664 665 { 666 istring str = "en text"; 667 regex.compile("zzz", false); 668 matches_buffer = null; 669 auto matches = regex.findAll(str, matches_buffer); 670 671 t.test!("==")(matches_buffer.length, 0); 672 } 673 674 { 675 istring str = "en text"; 676 regex.compile("zzz", false); 677 matches_buffer = null; 678 auto matches = regex.findAll(str, matches_buffer); 679 680 t.test!("==")(matches_buffer.length, 0); 681 } 682 683 { 684 istring str ="aaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 685 ~ "aaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 686 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaa" 687 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaa" 688 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaa" 689 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaa" 690 ~ "aaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbAbbbbbbbbbbbb" 691 ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbAbbbbbbb" 692 ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaAaaaaa" 693 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaa" 694 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaa" 695 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 696 ~ "aaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 697 ~ "aacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 698 ~ "aaaacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 699 700 regex.compile("a", true); 701 matches_buffer.length = 0; 702 703 foreach (i, match; regex.findAll(str, matches_buffer) ) 704 { 705 t.test!("==")(match, "a"[]); 706 } 707 708 t.test!("==")(matches_buffer.length, 695); 709 } 710 } 711