1 /******************************************************************************* 2 3 D Library Wrapper for PCRE regular expression engine. 4 5 Requires linking with libpcre (-lpcre). 6 7 Usage example: 8 9 --- 10 11 import ocean.text.regex.PCRE; 12 13 auto pcre = new PCRE; 14 15 // Simple, one-off use 16 auto match = pcre.preg_match("Hello World!", "^Hello"); 17 18 // Compile then reuse 19 auto regex = pcre.new CompiledRegex; 20 regex.compile("^Hello"); 21 for ( int i; i < 100; i++ ) 22 { 23 auto match = regex.match("Hello World!"); 24 } 25 26 --- 27 28 29 Related: 30 http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html 31 32 Copyright: 33 Copyright (c) 2009-2016 dunnhumby Germany GmbH. 34 All rights reserved. 35 36 License: 37 Boost Software License Version 1.0. See LICENSE_BOOST.txt for details. 38 Alternatively, this file may be distributed under the terms of the Tango 39 3-Clause BSD License (see LICENSE_BSD.txt for details). 40 41 Bear in mind this module provides bindings to an external library that 42 has its own license, which might be more restrictive. Please check the 43 external library license to see which conditions apply for linking. 44 45 *******************************************************************************/ 46 47 module ocean.text.regex.PCRE; 48 49 50 51 52 import ocean.meta.types.Qualifiers; 53 import ocean.core.TypeConvert: assumeUnique; 54 55 import ocean.core.Array : copy, concat; 56 import ocean.text.util.StringC; 57 import ocean.text.regex.c.pcre; 58 import ocean.core.TypeConvert; 59 60 import core.stdc.stdlib : free; 61 import ocean.text.convert.Formatter; 62 63 import ocean.core.Verify; 64 65 66 /******************************************************************************* 67 68 PCRE 69 70 *******************************************************************************/ 71 72 public class PCRE 73 { 74 /*************************************************************************** 75 76 Represents a PCRE Exception. 77 The class is re-usable exception where the error message can be 78 reset and the same instance can be re-thrown. 79 80 ***************************************************************************/ 81 82 public static class PcreException : Exception 83 { 84 /*********************************************************************** 85 86 Error code returned by pcre function 87 88 ***********************************************************************/ 89 90 public int error; 91 92 /*********************************************************************** 93 94 Constructor. 95 Just calls the super Exception constructor with initial error 96 message. 97 98 ***********************************************************************/ 99 100 public this() 101 { 102 super("Error message not yet set"); 103 } 104 105 /*********************************************************************** 106 107 Sets the error code and message. 108 109 Params: 110 code = error code to set 111 msg = the new exception message to be used 112 113 ***********************************************************************/ 114 115 private void set ( int code, istring msg ) 116 { 117 this.error = code; 118 this.msg = msg; 119 } 120 } 121 122 /*************************************************************************** 123 124 Limits the complexity of regex searches. If a regex search passes the 125 specified complexity limit without either finding a match or determining 126 that no match exists, it bails out, throwing an exception (see 127 CompiledRegex.match()). 128 129 The default value of 0 uses libpcre's built-in default complexity 130 limit (10 million, see below), which is set to be extremely permissive. 131 Any value less than 10 million will have the effect of reducing the 132 level of complexity tolerated, thus reducing the potential processing 133 time spent searching. 134 135 This field maps directly to the match_limit field in libpcre's 136 pcre_extra struct. 137 138 From http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html: 139 140 The match_limit field provides a means of preventing PCRE from using up 141 a vast amount of resources when running patterns that are not going to 142 match, but which have a very large number of possibilities in their 143 search trees. The classic example is the use of nested unlimited 144 repeats. 145 146 Internally, PCRE uses a function called match() which it calls 147 repeatedly (sometimes recursively). The limit set by match_limit is 148 imposed on the number of times this function is called during a match, 149 which has the effect of limiting the amount of backtracking that can 150 take place. For patterns that are not anchored, the count restarts from 151 zero for each position in the subject string. 152 153 The default value for the limit can be set when PCRE is built; the 154 default default is 10 million, which handles all but the most extreme 155 cases. 156 157 ***************************************************************************/ 158 159 public static immutable int DEFAULT_COMPLEXITY_LIMIT = 0; 160 161 public int complexity_limit = DEFAULT_COMPLEXITY_LIMIT; 162 163 /*************************************************************************** 164 165 String used internally for formatting. 166 167 ***************************************************************************/ 168 169 private mstring buffer_char; 170 171 /*************************************************************************** 172 173 A re-usable exception instance 174 175 ***************************************************************************/ 176 177 private PcreException exception; 178 179 /*************************************************************************** 180 181 Compiled regex class. Enables a regex pattern to be compiled once and 182 used for multiple searches. 183 184 ***************************************************************************/ 185 186 public class CompiledRegex 187 { 188 /*********************************************************************** 189 190 Pointer to C-allocated pcre regex object, created upon compilation 191 of a regex (see compile()). 192 193 ***********************************************************************/ 194 195 private pcre* pcre_object; 196 197 /*********************************************************************** 198 199 Settings used by the call to pcre_exec() in the match() method. 200 These are modified by the complexity_limit field of the outer class, 201 and by the study() method. 202 203 ***********************************************************************/ 204 205 private pcre_extra match_settings; 206 207 /*********************************************************************** 208 209 Destructor. Frees the C-allocated pcre object. 210 211 ***********************************************************************/ 212 213 ~this ( ) 214 { 215 this.cleanup(); 216 } 217 218 /*********************************************************************** 219 220 Compiles the specified regex for use in the match() method. Cleans 221 up a previously compiled regex, if this instance has been used 222 before. 223 224 Params: 225 pattern = pattern to search for, as a string 226 case_sens = case sensitive matching 227 228 Throws: 229 if the compilation of the regex fails 230 231 Out: 232 following a call to this method, the compiled regex exists 233 234 ***********************************************************************/ 235 236 public void compile ( cstring pattern, bool case_sens = true ) 237 out 238 { 239 assert(this.pcre_object !is null); 240 } 241 do 242 { 243 244 this.cleanup(); 245 246 char* errmsg; 247 int error_code; 248 int error_offset; 249 250 this.outer.buffer_char.concat(pattern, "\0"[]); 251 this.pcre_object = pcre_compile2(this.outer.buffer_char.ptr, 252 (case_sens ? 0 : PCRE_CASELESS), &error_code, &errmsg, 253 &error_offset, null); 254 if ( !this.pcre_object ) 255 { 256 this.outer.exception.msg = format( 257 "Error compiling regular expression: {} - on pattern: {} at position {}", 258 StringC.toDString(errmsg), pattern, error_offset 259 ); 260 this.outer.exception.error = error_code; 261 throw this.outer.exception; 262 } 263 } 264 265 /*********************************************************************** 266 267 Perform a regular expression match. 268 269 Params: 270 subject = the compiled patter will be matched against this string 271 272 Returns: 273 true, if matches or false if no match 274 275 Throws: 276 if an error occurs when running the regex search 277 278 In: 279 the regex must have been compiled 280 281 ***********************************************************************/ 282 283 public bool match ( cstring subject ) 284 { 285 //"" matches against the pattern "" 286 if ( this.outer.buffer_char == "\0" && subject == "" ) 287 { 288 return true; 289 } 290 291 return this.findFirst(subject) != null; 292 } 293 294 /*********************************************************************** 295 296 Performs a regular expression match and return the first found match 297 298 Params: 299 subject = input string 300 301 Returns: 302 slice to the first found match or null if no match 303 304 Throws: 305 if an error occurs when running the regex search 306 307 In: 308 the regex must have been compiled 309 310 ***********************************************************************/ 311 312 public cstring findFirst ( cstring subject ) 313 { 314 verify(this.pcre_object !is null); 315 316 // This method supports only one capture so size 3 is enough. 317 // 2/3 for the positions and 1/3 for a internal buffer 318 int[3] ovector; 319 320 if ( this.outer.complexity_limit != DEFAULT_COMPLEXITY_LIMIT ) 321 { 322 this.match_settings.flags |= PCRE_EXTRA_MATCH_LIMIT; 323 this.match_settings.match_limit = this.outer.complexity_limit; 324 } 325 326 int error_code = pcre_exec(this.pcre_object, &this.match_settings, 327 subject.ptr, castFrom!(size_t).to!(int)(subject.length), 0, 0, 328 ovector.ptr, ovector.length); 329 330 // A positive return value indicates that a single match was found 331 // and its indices stored in ovector[0] and ovector[1]. 332 // A zero return value indicates that multiple matches were found, 333 // the first stored in ovector[0] and ovector[1], and the rest 334 // discarded. 335 if ( error_code >= 0 ) 336 { 337 // we got a match but ovector is 0 so the whole subject matched 338 if ( ovector[0] == 0 && ovector[1] == 0 ) 339 { 340 return subject; 341 } 342 343 return subject[ovector[0] .. ovector[1]]; 344 } 345 // Negative return values indicate failure or error. We ignore 346 // match failures and throw on error. 347 else if ( error_code != PCRE_ERROR_NOMATCH ) 348 { 349 this.outer.exception.set(error_code, 350 "Error on executing regular expression!"); 351 throw this.outer.exception; 352 } 353 354 return null; 355 } 356 357 /*********************************************************************** 358 359 Performs a regular expression match and returns an array of slices 360 of all matches 361 362 Params: 363 subject = input string 364 matches_buffer = found matches will be stored here 365 366 Returns: 367 Array with slices of all matches 368 369 Throws: 370 if an error occurs when running the regex search 371 372 In: 373 the regex must have been compiled 374 375 ***********************************************************************/ 376 377 public cstring[] findAll ( cstring subject, ref cstring[] matches_buffer ) 378 { 379 verify(this.pcre_object !is null); 380 381 ptrdiff_t pos; 382 383 while ( pos < subject.length ) 384 { 385 if ( auto match = this.findFirst(subject[pos .. $]) ) 386 { 387 matches_buffer ~= match; 388 //set pos to end of the match 389 pos = match.ptr - subject.ptr + match.length; 390 } 391 else 392 { 393 break; 394 } 395 } 396 397 return matches_buffer; 398 } 399 400 /*********************************************************************** 401 402 Study a compiled regex in order to increase processing efficiency 403 when calling match(). This is usually only worth doing for a regex 404 which will be used many times, and does not always yield an 405 improvement in efficiency. 406 407 Throws: 408 if an error occurs when studying the regex 409 410 In: 411 the regex must have been compiled 412 413 ***********************************************************************/ 414 415 public void study ( ) 416 { 417 verify(this.pcre_object !is null); 418 419 char* errmsg; 420 auto res = pcre_study(this.pcre_object, 0, &errmsg); 421 if ( errmsg ) 422 { 423 auto derrmsg = StringC.toDString(errmsg); 424 this.outer.exception.set(0, assumeUnique(derrmsg)); 425 throw this.outer.exception; 426 } 427 if ( res ) 428 { 429 this.match_settings.study_data = res.study_data; 430 } 431 } 432 433 /*********************************************************************** 434 435 Cleans up the compiled regex object and the study data. 436 437 ***********************************************************************/ 438 439 private void cleanup ( ) 440 { 441 free(this.pcre_object); 442 this.match_settings = this.match_settings.init; 443 } 444 } 445 446 /*************************************************************************** 447 448 Constructor. Initializes the re-usable exception. 449 450 ***************************************************************************/ 451 452 public this ( ) 453 { 454 this.exception = new PcreException(); 455 } 456 457 /*************************************************************************** 458 459 Perform a regular expression match. Note that this method internally 460 allocates and then frees a C pcre object each time it is called. If you 461 want to run the same regex search multiple times on different input, you 462 are probably better off using the compile() method, above. 463 464 Usage: 465 auto regex = new PCRE; 466 bool match = regex.preg_match("Hello World!", "^Hello"); 467 468 Params: 469 subject = the compiled patter will be matched against this string 470 pattern = pattern to search for, as a string 471 case_sens = case sensitive matching 472 473 Returns: 474 true, if matches or false if no match 475 476 Throws: 477 if the compilation or running of the regex fails 478 479 ***************************************************************************/ 480 481 public bool preg_match ( cstring subject, cstring pattern, bool case_sens = true ) 482 { 483 scope regex = new CompiledRegex; 484 regex.compile(pattern, case_sens); 485 return regex.match(subject); 486 } 487 } 488 489 version (unittest) 490 { 491 import ocean.core.Test; 492 493 class CounterNamedTest : NamedTest 494 { 495 static uint test_num; 496 497 this ( ) 498 { 499 super(format("PCRE test #{}", ++test_num)); 500 } 501 } 502 } 503 504 505 /******************************************************************************* 506 507 Test for invalid pattern 508 509 *******************************************************************************/ 510 511 unittest 512 { 513 auto pcre = new PCRE; 514 testThrown!(PCRE.PcreException)(pcre.preg_match("", "(")); 515 } 516 517 /******************************************************************************* 518 519 Tests for simple boolean matching via the preg_match() method. (This 520 unittest tests only the interface of this method. It does not test the full 521 range of PCRE features as that is beyond its scope.) 522 523 *******************************************************************************/ 524 525 unittest 526 { 527 void test ( scope bool delegate ( ) dg, bool match ) 528 { 529 auto t = new CounterNamedTest; 530 t.test!("==")(match, dg()); 531 } 532 533 auto pcre = new PCRE; 534 535 // Empty pattern (matches any string) 536 test({ return pcre.preg_match("Hello World", ""); }, true); 537 538 // Empty string and empty pattern (match) 539 test({ return pcre.preg_match("", ""); }, true); 540 541 // Empty string (no match) 542 test({ return pcre.preg_match("", "a"); }, false); 543 544 // Simple string match 545 test({ return pcre.preg_match("Hello World", "Hello"); }, true); 546 547 // Simple string match (fail) 548 test({ return pcre.preg_match("Hello World", "Hallo"); }, false); 549 550 // Case-sensitive match (fail) 551 test({ return pcre.preg_match("Hello World", "hello"); }, false); 552 553 // Case-insensitive match 554 test({ return pcre.preg_match("Hello World", "hello", false); }, true); 555 } 556 557 /******************************************************************************* 558 559 Tests for single substring matching via the CompiledRegex.findFirst() 560 method. 561 562 *******************************************************************************/ 563 564 unittest 565 { 566 auto pcre = new PCRE; 567 auto regex = pcre..new CompiledRegex; 568 569 void testFind ( cstring needle, cstring pre, cstring match, cstring post ) 570 { 571 regex.compile(needle); 572 573 auto haystack = pre ~ match ~ post; 574 auto res = regex.findFirst(haystack); 575 576 auto t = new CounterNamedTest; 577 t.test!("==")(res, haystack[pre.length .. pre.length + match.length]); 578 } 579 580 // Simple match 581 testFind( 582 "a", 583 "bbb", 584 "a", 585 "ccc" 586 ); 587 588 // Match with a more complicated regex 589 testFind( 590 "(firstparam=[^l]*liza.*(secondparam=mary|secondparam=lena|secondparam=john))|((secondparam=mary|secondparam=lena|secondparam=john).*firstparam=liza)", 591 "http://example.org?", 592 "firstparam=elizabeth&rand=84527497861&secondparam=mary", 593 "&some=other¶ms=are&here=%20%21%22%12222" 594 ); 595 596 // Single match returned when multiple matches are possible 597 testFind( 598 "a", 599 "bbb", 600 "a", 601 "cacac" 602 ); 603 } 604 605 /******************************************************************************* 606 607 Tests for multiple substring matching via the CompiledRegex.findAll() 608 method. 609 610 *******************************************************************************/ 611 612 unittest 613 { 614 auto pcre = new PCRE; 615 auto regex = pcre..new CompiledRegex; 616 cstring[] matches_buffer; 617 618 auto t = new NamedTest("PCRE findAll"); 619 620 { 621 istring str = "apa bepa cepa depa epa fepa gepa hepa"; 622 regex.compile("a[ ]", false); 623 624 matches_buffer.length = 0; 625 assumeSafeAppend(matches_buffer); 626 627 foreach ( match; regex.findAll(str, matches_buffer) ) 628 { 629 t.test!("==")(match, "a "[]); 630 } 631 632 t.test!("==")(matches_buffer.length, 7); 633 } 634 635 { 636 istring[3] exp = ["ast", "at", "ast"]; 637 istring str = "en hast at en annan hast"; 638 regex.compile("a[s]*t", false); 639 640 matches_buffer = null; 641 642 643 foreach (i, match; regex.findAll(str, matches_buffer) ) 644 { 645 t.test!("==")(match, exp[i]); 646 } 647 t.test!("==")(matches_buffer.length, 3); 648 } 649 650 { 651 istring[3] exp = ["ta", "tb", "td"]; 652 istring str = "tatb t c Tf td"; 653 regex.compile("t[\\w]", true); 654 regex.study(); 655 656 matches_buffer = null; 657 658 foreach (i, match; regex.findAll(str, matches_buffer) ) 659 { 660 t.test!("==")(match, exp[i]); 661 } 662 663 t.test!("==")(matches_buffer.length, 3); 664 } 665 666 { 667 istring str = "en text"; 668 regex.compile("zzz", false); 669 matches_buffer = null; 670 auto matches = regex.findAll(str, matches_buffer); 671 672 t.test!("==")(matches_buffer.length, 0); 673 } 674 675 { 676 istring str = "en text"; 677 regex.compile("zzz", false); 678 matches_buffer = null; 679 auto matches = regex.findAll(str, matches_buffer); 680 681 t.test!("==")(matches_buffer.length, 0); 682 } 683 684 { 685 istring str ="aaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 686 ~ "aaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 687 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaa" 688 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaa" 689 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaa" 690 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaa" 691 ~ "aaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbAbbbbbbbbbbbb" 692 ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbAbbbbbbb" 693 ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaAaaaaa" 694 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaa" 695 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaa" 696 ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 697 ~ "aaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 698 ~ "aacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" 699 ~ "aaaacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; 700 701 regex.compile("a", true); 702 matches_buffer.length = 0; 703 704 foreach (i, match; regex.findAll(str, matches_buffer) ) 705 { 706 t.test!("==")(match, "a"[]); 707 } 708 709 t.test!("==")(matches_buffer.length, 695); 710 } 711 }