/*******************************************************************************

    D Library Wrapper for PCRE regular expression engine.

    Requires linking with libpcre (-lpcre).

    Usage example:

    ---

        import ocean.text.regex.PCRE;

        auto pcre = new PCRE;

        // Simple, one-off use
        auto match = pcre.preg_match("Hello World!", "^Hello");

        // Compile then reuse
        auto regex = pcre.new CompiledRegex;
        regex.compile("^Hello");
        for ( int i; i < 100; i++ )
        {
            auto found = regex.match("Hello World!");
        }

    ---

    Related:
        http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

        Bear in mind this module provides bindings to an external library that
        has its own license, which might be more restrictive. Please check the
        external library license to see which conditions apply for linking.

44 45 *******************************************************************************/46 47 moduleocean.text.regex.PCRE;
48 49 50 51 52 importocean.transition;
53 54 importocean.core.Array : copy, concat;
55 importocean.text.util.StringC;
56 importocean.text.regex.c.pcre;
57 importocean.core.TypeConvert;
58 59 importcore.stdc.stdlib : free;
60 importocean.text.convert.Formatter;
61 62 importocean.core.Verify;
63 64 65 /*******************************************************************************
66 67 PCRE
68 69 *******************************************************************************/70 71 publicclassPCRE72 {
/***************************************************************************

    Exception type thrown by this module.

    A single instance is meant to be re-used: its error code and message
    can be overwritten and the same object thrown again.

***************************************************************************/

public static class PcreException : Exception
{
    /***********************************************************************

        Error code returned by the pcre function that failed.

    ***********************************************************************/

    public int error;

    /***********************************************************************

        Constructor. Forwards a placeholder message to the Exception base
        class; the real message is filled in later via set().

    ***********************************************************************/

    public this ( )
    {
        super("Error message not yet set");
    }

    /***********************************************************************

        Overwrites the message and the error code of this instance.

        Params:
            code = error code to store
            msg = message to store

    ***********************************************************************/

    private void set ( int code, istring msg )
    {
        this.msg = msg;
        this.error = code;
    }
}
/***************************************************************************

    Upper bound on the complexity of a regex search. When a search exceeds
    this bound without having either found a match or established that
    there is none, it is aborted and an exception is thrown (see
    CompiledRegex.match()).

    The default value of 0 means "use libpcre's built-in default limit"
    (10 million, see below), which is extremely permissive. Any value
    below 10 million lowers the tolerated complexity and thereby the
    processing time that may be spent on a single search.

    This field maps directly to the match_limit field in libpcre's
    pcre_extra struct.

    From http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html:

    The match_limit field provides a means of preventing PCRE from using up
    a vast amount of resources when running patterns that are not going to
    match, but which have a very large number of possibilities in their
    search trees. The classic example is the use of nested unlimited
    repeats.

    Internally, PCRE uses a function called match() which it calls
    repeatedly (sometimes recursively). The limit set by match_limit is
    imposed on the number of times this function is called during a match,
    which has the effect of limiting the amount of backtracking that can
    take place. For patterns that are not anchored, the count restarts from
    zero for each position in the subject string.

    The default value for the limit can be set when PCRE is built; the
    default default is 10 million, which handles all but the most extreme
    cases.

***************************************************************************/

public static immutable int DEFAULT_COMPLEXITY_LIMIT = 0;

public int complexity_limit = DEFAULT_COMPLEXITY_LIMIT;

/***************************************************************************

    Scratch string used internally for formatting (holds the
    null-terminated copy of a pattern while it is being compiled).

***************************************************************************/

private mstring buffer_char;

/***************************************************************************

    The re-usable exception instance thrown by this class and by the
    CompiledRegex instances created from it.

***************************************************************************/

private PcreException exception;
/***************************************************************************

    Compiled regex class. Enables a regex pattern to be compiled once and
    used for multiple searches.

    Instances own C-allocated libpcre data (the compiled pattern and,
    after study(), the study result); both are released by the destructor
    and whenever a new pattern is compiled.

***************************************************************************/

public class CompiledRegex
{
    /***********************************************************************

        Pointer to C-allocated pcre regex object, created upon compilation
        of a regex (see compile()). null while nothing is compiled.

    ***********************************************************************/

    private pcre* pcre_object;

    /***********************************************************************

        Settings passed to pcre_exec(). These are modified by the
        complexity_limit field of the outer class and by the study()
        method.

    ***********************************************************************/

    private pcre_extra match_settings;

    /***********************************************************************

        C-allocated block returned by pcre_study(), or null if the regex
        has not been studied. match_settings.study_data points into data
        owned by this block, so it has to stay alive for as long as the
        study data is in use and is freed in cleanup().

    ***********************************************************************/

    private pcre_extra* study_result;

    /***********************************************************************

        Destructor. Frees the C-allocated pcre data.

    ***********************************************************************/

    ~this ( )
    {
        this.cleanup();
    }

    /***********************************************************************

        Compiles the specified regex for use in the match() / findFirst() /
        findAll() methods. Cleans up a previously compiled regex (and its
        study data), if this instance has been used before.

        Params:
            pattern = pattern to search for, as a string
            case_sens = case sensitive matching

        Throws:
            PcreException if the compilation of the regex fails

        Out:
            following a call to this method, the compiled regex exists

    ***********************************************************************/

    public void compile ( cstring pattern, bool case_sens = true )
    out
    {
        assert(this.pcre_object !is null);
    }
    body
    {
        this.cleanup();

        char* errmsg;
        int error_code;
        int error_offset;

        // libpcre expects a null-terminated pattern; build it in the
        // outer instance's scratch buffer.
        this.outer.buffer_char.concat(pattern, "\0"[]);
        this.pcre_object = pcre_compile2(this.outer.buffer_char.ptr,
            (case_sens ? 0 : PCRE_CASELESS), &error_code, &errmsg,
            &error_offset, null);

        if ( !this.pcre_object )
        {
            this.outer.exception.set(error_code, format(
                "Error compiling regular expression: {} - on pattern: {} at position {}",
                StringC.toDString(errmsg), pattern, error_offset
            ));
            throw this.outer.exception;
        }
    }

    /***********************************************************************

        Perform a regular expression match.

        The result depends only on the pattern compiled into this instance
        and reports every successful match, including matches of zero
        length (e.g. an empty pattern, or "$" at the end of the subject).

        Params:
            subject = the compiled pattern will be matched against this
                string

        Returns:
            true, if matches or false if no match

        Throws:
            PcreException if an error occurs when running the regex search

        In:
            the regex must have been compiled

    ***********************************************************************/

    public bool match ( cstring subject )
    {
        int[3] ovector;
        return this.exec(subject, ovector);
    }

    /***********************************************************************

        Performs a regular expression match and returns the first found
        match.

        Note: if the match is empty and located at the very start of
        subject, the whole subject is returned instead of an empty slice.
        findAll() relies on this to make progress, so the behaviour is
        kept.

        Params:
            subject = input string

        Returns:
            slice to the first found match or null if no match

        Throws:
            PcreException if an error occurs when running the regex search

        In:
            the regex must have been compiled

    ***********************************************************************/

    public cstring findFirst ( cstring subject )
    {
        // Only the overall match is needed, so size 3 is enough:
        // 2/3 for the start/end offsets and 1/3 as workspace for libpcre.
        int[3] ovector;

        if ( !this.exec(subject, ovector) )
        {
            return null;
        }

        // Empty match at offset 0: hand back the whole subject (see the
        // note in the method documentation).
        if ( ovector[0] == 0 && ovector[1] == 0 )
        {
            return subject;
        }

        return subject[ovector[0] .. ovector[1]];
    }

    /***********************************************************************

        Performs a regular expression match and returns an array of slices
        of all matches. Matches are appended to matches_buffer; the buffer
        is not cleared first.

        Params:
            subject = input string
            matches_buffer = found matches will be stored here

        Returns:
            Array with slices of all matches

        Throws:
            PcreException if an error occurs when running the regex search

        In:
            the regex must have been compiled

    ***********************************************************************/

    public cstring[] findAll ( cstring subject, ref cstring[] matches_buffer )
    {
        verify(this.pcre_object !is null);

        ptrdiff_t pos;

        while ( pos < subject.length )
        {
            auto match = this.findFirst(subject[pos .. $]);
            if ( !match )
            {
                break;
            }

            matches_buffer ~= match;
            // continue searching right after the end of this match
            pos = match.ptr - subject.ptr + match.length;
        }

        return matches_buffer;
    }

    /***********************************************************************

        Study a compiled regex in order to increase processing efficiency
        when calling match(). This is usually only worth doing for a regex
        which will be used many times, and does not always yield an
        improvement in efficiency.

        Throws:
            PcreException if an error occurs when studying the regex

        In:
            the regex must have been compiled

    ***********************************************************************/

    public void study ( )
    {
        verify(this.pcre_object !is null);

        char* errmsg;
        auto res = pcre_study(this.pcre_object, 0, &errmsg);
        if ( errmsg )
        {
            auto derrmsg = StringC.toDString(errmsg);
            this.outer.exception.set(0, assumeUnique(derrmsg));
            throw this.outer.exception;
        }

        // A null result without an error means that studying produced no
        // useful data; nothing to do in that case.
        if ( res )
        {
            // Drop the result of an earlier study() of the same regex.
            free(this.study_result);
            this.study_result = res;

            this.match_settings.study_data = res.study_data;
            // pcre_exec() only looks at study_data when the matching flag
            // bit is set. With study options 0 the flags of the returned
            // block are expected to carry just that bit, so they are
            // merged as they are.
            this.match_settings.flags |= res.flags;
        }
    }

    /***********************************************************************

        Runs pcre_exec() for the compiled regex on the given subject.

        Params:
            subject = input string
            ovector = receives the start and end offset of the match in
                elements 0 and 1; element 2 is workspace for libpcre

        Returns:
            true if the regex matched, false if it did not

        Throws:
            PcreException if pcre_exec() reports an error other than
            "no match"

    ***********************************************************************/

    private bool exec ( cstring subject, ref int[3] ovector )
    {
        verify(this.pcre_object !is null);

        if ( this.outer.complexity_limit != DEFAULT_COMPLEXITY_LIMIT )
        {
            this.match_settings.flags |= PCRE_EXTRA_MATCH_LIMIT;
            this.match_settings.match_limit = this.outer.complexity_limit;
        }
        else
        {
            // Back to libpcre's built-in limit if the user reset the
            // complexity limit after an earlier search.
            this.match_settings.flags &= ~PCRE_EXTRA_MATCH_LIMIT;
        }

        // pcre_exec() rejects a null subject pointer, but an empty D
        // string may well have one.
        if ( subject.ptr is null )
        {
            subject = "";
        }

        int rc = pcre_exec(this.pcre_object, &this.match_settings,
            subject.ptr, castFrom!(size_t).to!(int)(subject.length), 0, 0,
            ovector.ptr, ovector.length);

        // A positive return value indicates a match whose offsets are
        // stored in ovector[0] and ovector[1]. A zero return value also
        // indicates a match, but ovector was too small to hold the
        // offsets of all captured substrings; the overall match is still
        // stored in ovector[0] and ovector[1].
        if ( rc >= 0 )
        {
            return true;
        }

        // Negative return values indicate failure or error. A plain
        // "no match" is not an error.
        if ( rc == PCRE_ERROR_NOMATCH )
        {
            return false;
        }

        this.outer.exception.set(rc,
            "Error on executing regular expression!");
        throw this.outer.exception;
    }

    /***********************************************************************

        Cleans up the compiled regex object and the study data, leaving
        the instance in its initial, uncompiled state.

    ***********************************************************************/

    private void cleanup ( )
    {
        free(this.pcre_object);
        this.pcre_object = null;

        free(this.study_result);
        this.study_result = null;

        this.match_settings = this.match_settings.init;
    }
}
/***************************************************************************

    Constructor. Creates the exception instance that is re-used for every
    error thrown by this object.

***************************************************************************/

public this ( )
{
    this.exception = new PcreException;
}
/***************************************************************************

    One-off regular expression match: compiles the pattern, runs it against
    the subject and discards the compiled regex again. As a C pcre object
    is allocated and freed on every call, prefer a CompiledRegex instance
    (see CompiledRegex.compile()) when the same pattern is applied to
    several inputs.

    Usage:
        auto regex = new PCRE;
        bool match = regex.preg_match("Hello World!", "^Hello");

    Params:
        subject = the compiled pattern will be matched against this string
        pattern = pattern to search for, as a string
        case_sens = case sensitive matching

    Returns:
        true, if matches or false if no match

    Throws:
        if the compilation or running of the regex fails

***************************************************************************/

public bool preg_match ( cstring subject, cstring pattern, bool case_sens = true )
{
    // scope: the temporary regex is destroyed when this method returns
    scope compiled = new CompiledRegex;
    compiled.compile(pattern, case_sens);

    bool matched = compiled.match(subject);
    return matched;
}
486 }
version ( UnitTest )
{
    import ocean.core.Test;

    /***************************************************************************

        NamedTest whose name contains a running number, so that each test
        instance created in this module can be told apart.

    ***************************************************************************/

    class CounterNamedTest : NamedTest
    {
        /// Number of instances created so far.
        static uint test_num;

        this ( )
        {
            auto number = ++test_num;
            super(format("PCRE test #{}", number));
        }
    }
}
/*******************************************************************************

    An invalid pattern must make preg_match() throw a PcreException.

*******************************************************************************/

unittest
{
    auto engine = new PCRE;
    testThrown!(PCRE.PcreException)(engine.preg_match("", "("));
}
/*******************************************************************************

    Tests for simple boolean matching via the preg_match() method. (Only the
    interface of the method is exercised here; covering the full range of PCRE
    features is out of scope.)

*******************************************************************************/

unittest
{
    static struct Case
    {
        cstring subject;
        cstring pattern;
        bool case_sens;
        bool expected;
    }

    Case[] cases = [
        // Empty pattern (matches any string)
        Case("Hello World", "", true, true),
        // Empty string and empty pattern (match)
        Case("", "", true, true),
        // Empty string (no match)
        Case("", "a", true, false),
        // Simple string match
        Case("Hello World", "Hello", true, true),
        // Simple string match (fail)
        Case("Hello World", "Hallo", true, false),
        // Case-sensitive match (fail)
        Case("Hello World", "hello", true, false),
        // Case-insensitive match
        Case("Hello World", "hello", false, true)
    ];

    auto pcre = new PCRE;

    foreach ( c; cases )
    {
        auto t = new CounterNamedTest;
        t.test!("==")(c.expected,
            pcre.preg_match(c.subject, c.pattern, c.case_sens));
    }
}
/*******************************************************************************

    Tests for single substring matching via the CompiledRegex.findFirst()
    method.

*******************************************************************************/

unittest
{
    auto pcre = new PCRE;
    auto regex = pcre.new CompiledRegex;

    // Compiles needle, searches it in pre ~ match ~ post and expects exactly
    // the `match` part of that haystack to be found.
    void testFind ( cstring needle, cstring pre, cstring match, cstring post )
    {
        regex.compile(needle);

        auto haystack = pre ~ match ~ post;
        auto res = regex.findFirst(haystack);

        auto t = new CounterNamedTest;
        t.test!("==")(res, haystack[pre.length .. pre.length + match.length]);
    }

    // Simple match
    testFind(
        "a",
        "bbb",
        "a",
        "ccc"
    );

    // Match with a more complicated regex
    testFind(
        "(firstparam=[^l]*liza.*(secondparam=mary|secondparam=lena|secondparam=john))|((secondparam=mary|secondparam=lena|secondparam=john).*firstparam=liza)",
        "http://example.org?",
        "firstparam=elizabeth&rand=84527497861&secondparam=mary",
        "&some=other&params=are&here=%20%21%22%12222"
    );

    // Single match returned when multiple matches are possible
    testFind(
        "a",
        "bbb",
        "a",
        "cacac"
    );
}
/*******************************************************************************

    Tests for multiple substring matching via the CompiledRegex.findAll()
    method.

*******************************************************************************/

unittest
{
    auto pcre = new PCRE;
    auto regex = pcre.new CompiledRegex;
    cstring[] matches_buffer;

    auto t = new NamedTest("PCRE findAll");

    // Same match found several times; buffer re-used via stomping
    {
        istring str = "apa bepa cepa depa epa fepa gepa hepa";
        regex.compile("a[ ]", false);

        matches_buffer.length = 0;
        enableStomping(matches_buffer);

        foreach ( match; regex.findAll(str, matches_buffer) )
        {
            t.test!("==")(match, "a "[]);
        }

        t.test!("==")(matches_buffer.length, 7);
    }

    // Matches of different length
    {
        istring[3] exp = ["ast", "at", "ast"];
        istring str = "en hast at en annan hast";
        regex.compile("a[s]*t", false);

        matches_buffer = null;

        foreach ( i, match; regex.findAll(str, matches_buffer) )
        {
            t.test!("==")(match, exp[i]);
        }

        t.test!("==")(matches_buffer.length, 3);
    }

    // Case-sensitive search on a studied regex
    {
        istring[3] exp = ["ta", "tb", "td"];
        istring str = "tatb t c Tf td";
        regex.compile("t[\\w]", true);
        regex.study();

        matches_buffer = null;

        foreach ( i, match; regex.findAll(str, matches_buffer) )
        {
            t.test!("==")(match, exp[i]);
        }

        t.test!("==")(matches_buffer.length, 3);
    }

    // No match: the buffer stays empty
    {
        istring str = "en text";
        regex.compile("zzz", false);
        matches_buffer = null;
        regex.findAll(str, matches_buffer);

        t.test!("==")(matches_buffer.length, 0);
    }

    // No match: the returned array is empty as well
    {
        istring str = "en text";
        regex.compile("zzz", false);
        matches_buffer = null;
        auto matches = regex.findAll(str, matches_buffer);

        t.test!("==")(matches.length, 0);
    }

    // Large number of matches
    {
        istring str = "aaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbAbbbbbbbbbbbb"
            ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbAbbbbbbb"
            ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaAaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaa"
            ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
            ~ "aaaacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";

        regex.compile("a", true);
        matches_buffer.length = 0;

        foreach ( i, match; regex.findAll(str, matches_buffer) )
        {
            t.test!("==")(match, "a"[]);
        }

        t.test!("==")(matches_buffer.length, 695);
    }
}
711