1 /*******************************************************************************
2 
3     D Library Wrapper for PCRE regular expression engine.
4 
5     Requires linking with libpcre (-lpcre).
6 
7     Usage example:
8 
9     ---
10 
11         import ocean.text.regex.PCRE;
12 
13         auto pcre = new PCRE;
14 
15         // Simple, one-off use
16         auto match = pcre.preg_match("Hello World!", "^Hello");
17 
18         // Compile then reuse
19         auto regex = pcre.new CompiledRegex;
20         regex.compile("^Hello");
21         for ( int i; i < 100; i++ )
22         {
23             auto match = regex.match("Hello World!");
24         }
25 
26     ---
27 
28 
29     Related:
30         http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html
31 
32     Copyright:
33         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
34         All rights reserved.
35 
36     License:
37         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
38         Alternatively, this file may be distributed under the terms of the Tango
39         3-Clause BSD License (see LICENSE_BSD.txt for details).
40 
41         Bear in mind this module provides bindings to an external library that
42         has its own license, which might be more restrictive. Please check the
43         external library license to see which conditions apply for linking.
44 
45 *******************************************************************************/
46 
47 module ocean.text.regex.PCRE;
48 
49 
50 
51 
52 import ocean.transition;
53 
54 import ocean.core.Array : copy, concat;
55 import ocean.text.util.StringC;
56 import ocean.text.regex.c.pcre;
57 import ocean.core.TypeConvert;
58 
59 import core.stdc.stdlib : free;
60 import ocean.text.convert.Formatter;
61 
62 import ocean.core.Verify;
63 
64 
65 /*******************************************************************************
66 
67     PCRE
68 
69 *******************************************************************************/
70 
71 public class PCRE
72 {
73     /***************************************************************************
74 
75         Represents a PCRE Exception.
76         The class is re-usable exception where the error message can be
77         reset and the same instance can be re-thrown.
78 
79     ***************************************************************************/
80 
81     public static class PcreException : Exception
82     {
83         /***********************************************************************
84 
85             Error code returned by pcre function
86 
87         ***********************************************************************/
88 
89         public int error;
90 
91         /***********************************************************************
92 
93             Constructor.
94             Just calls the super Exception constructor with initial error
95             message.
96 
97         ***********************************************************************/
98 
99         public this()
100         {
101             super("Error message not yet set");
102         }
103 
104         /***********************************************************************
105 
106             Sets the error code and message.
107 
108             Params:
109                 code = error code to set
110                 msg = the new exception message to be used
111 
112         ***********************************************************************/
113 
114         private void set ( int code, istring msg )
115         {
116             this.error = code;
117             this.msg = msg;
118         }
119     }
120 
121     /***************************************************************************
122 
123         Limits the complexity of regex searches. If a regex search passes the
124         specified complexity limit without either finding a match or determining
125         that no match exists, it bails out, throwing an exception (see
126         CompiledRegex.match()).
127 
128         The default value of 0 uses libpcre's built-in default complexity
129         limit (10 million, see below), which is set to be extremely permissive.
130         Any value less than 10 million will have the effect of reducing the
131         level of complexity tolerated, thus reducing the potential processing
132         time spent searching.
133 
134         This field maps directly to the match_limit field in libpcre's
135         pcre_extra struct.
136 
137         From http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html:
138 
139         The match_limit field provides a means of preventing PCRE from using up
140         a vast amount of resources when running patterns that are not going to
141         match, but which have a very large number of possibilities in their
142         search trees. The classic example is the use of nested unlimited
143         repeats.
144 
145         Internally, PCRE uses a function called match() which it calls
146         repeatedly (sometimes recursively). The limit set by match_limit is
147         imposed on the number of times this function is called during a match,
148         which has the effect of limiting the amount of backtracking that can
149         take place. For patterns that are not anchored, the count restarts from
150         zero for each position in the subject string.
151 
152         The default value for the limit can be set when PCRE is built; the
153         default default is 10 million, which handles all but the most extreme
154         cases.
155 
156     ***************************************************************************/
157 
158     public static immutable int DEFAULT_COMPLEXITY_LIMIT = 0;
159 
160     public int complexity_limit = DEFAULT_COMPLEXITY_LIMIT;
161 
162     /***************************************************************************
163 
164         String used internally for formatting.
165 
166     ***************************************************************************/
167 
168     private mstring buffer_char;
169 
170     /***************************************************************************
171 
172         A re-usable exception instance
173 
174     ***************************************************************************/
175 
176     private PcreException exception;
177 
178     /***************************************************************************
179 
180         Compiled regex class. Enables a regex pattern to be compiled once and
181         used for multiple searches.
182 
183     ***************************************************************************/
184 
185     public class CompiledRegex
186     {
187         /***********************************************************************
188 
189             Pointer to C-allocated pcre regex object, created upon compilation
190             of a regex (see compile()).
191 
192         ***********************************************************************/
193 
194         private pcre* pcre_object;
195 
196         /***********************************************************************
197 
198             Settings used by the call to pcre_exec() in the match() method.
199             These are modified by the complexity_limit field of the outer class,
200             and by the study() method.
201 
202         ***********************************************************************/
203 
204         private pcre_extra match_settings;
205 
206         /***********************************************************************
207 
208             Destructor. Frees the C-allocated pcre object.
209 
210         ***********************************************************************/
211 
212         ~this ( )
213         {
214             this.cleanup();
215         }
216 
217         /***********************************************************************
218 
219             Compiles the specified regex for use in the match() method. Cleans
220             up a previously compiled regex, if this instance has been used
221             before.
222 
223             Params:
224                 pattern = pattern to search for, as a string
225                 case_sens = case sensitive matching
226 
227             Throws:
228                 if the compilation of the regex fails
229 
230             Out:
231                 following a call to this method, the compiled regex exists
232 
233         ***********************************************************************/
234 
235         public void compile ( cstring pattern, bool case_sens = true )
236         out
237         {
238             assert(this.pcre_object !is null);
239         }
240         body
241         {
242 
243             this.cleanup();
244 
245             char* errmsg;
246             int error_code;
247             int error_offset;
248 
249             this.outer.buffer_char.concat(pattern, "\0"[]);
250             this.pcre_object = pcre_compile2(this.outer.buffer_char.ptr,
251                     (case_sens ? 0 : PCRE_CASELESS), &error_code, &errmsg,
252                     &error_offset, null);
253             if ( !this.pcre_object )
254             {
255                 this.outer.exception.msg = format(
256                     "Error compiling regular expression: {} - on pattern: {} at position {}",
257                     StringC.toDString(errmsg), pattern, error_offset
258                 );
259                 this.outer.exception.error = error_code;
260                 throw this.outer.exception;
261             }
262         }
263 
264         /***********************************************************************
265 
266             Perform a regular expression match.
267 
268             Params:
269                 subject = the compiled patter will be matched against this string
270 
271             Returns:
272                 true, if matches or false if no match
273 
274             Throws:
275                 if an error occurs when running the regex search
276 
277             In:
278                 the regex must have been compiled
279 
280         ***********************************************************************/
281 
282         public bool match ( cstring subject )
283         {
284             //"" matches against the pattern ""
285             if ( this.outer.buffer_char == "\0" && subject == "" )
286             {
287                 return true;
288             }
289 
290             return this.findFirst(subject) != null;
291         }
292 
293         /***********************************************************************
294 
295             Performs a regular expression match and return the first found match
296 
297             Params:
298                 subject  = input string
299 
300             Returns:
301                 slice to the first found match or null if no match
302 
303             Throws:
304                 if an error occurs when running the regex search
305 
306             In:
307                 the regex must have been compiled
308 
309         ***********************************************************************/
310 
311         public cstring findFirst ( cstring subject )
312         {
313             verify(this.pcre_object !is null);
314 
315             // This method supports only one capture so size 3 is enough.
316             // 2/3 for the positions and 1/3 for a internal buffer
317             int[3] ovector;
318 
319             if ( this.outer.complexity_limit != DEFAULT_COMPLEXITY_LIMIT )
320             {
321                 this.match_settings.flags |= PCRE_EXTRA_MATCH_LIMIT;
322                 this.match_settings.match_limit = this.outer.complexity_limit;
323             }
324 
325             int error_code = pcre_exec(this.pcre_object, &this.match_settings,
326                 subject.ptr, castFrom!(size_t).to!(int)(subject.length), 0, 0,
327                 ovector.ptr, ovector.length);
328 
329             // A positive return value indicates that a single match was found
330             // and its indices stored in ovector[0] and ovector[1].
331             // A zero return value indicates that multiple matches were found,
332             // the first stored in ovector[0] and ovector[1], and the rest
333             // discarded.
334             if ( error_code >= 0 )
335             {
336                 // we got a match but ovector is 0 so the whole subject matched
337                 if ( ovector[0] == 0 && ovector[1] == 0 )
338                 {
339                     return subject;
340                 }
341 
342                 return subject[ovector[0] .. ovector[1]];
343             }
344             // Negative return values indicate failure or error. We ignore
345             // match failures and throw on error.
346             else if ( error_code != PCRE_ERROR_NOMATCH )
347             {
348                 this.outer.exception.set(error_code,
349                     "Error on executing regular expression!");
350                 throw this.outer.exception;
351             }
352 
353             return null;
354         }
355 
356         /***********************************************************************
357 
358             Performs a regular expression match and returns an array of slices
359             of all matches
360 
361             Params:
362                 subject  = input string
363                 matches_buffer = found matches will be stored here
364 
365             Returns:
366                 Array with slices of all matches
367 
368             Throws:
369                 if an error occurs when running the regex search
370 
371             In:
372                 the regex must have been compiled
373 
374         ***********************************************************************/
375 
376         public cstring[] findAll ( cstring subject, ref cstring[] matches_buffer )
377         {
378             verify(this.pcre_object !is null);
379 
380             ptrdiff_t pos;
381 
382             while ( pos < subject.length )
383             {
384                 if ( auto match = this.findFirst(subject[pos .. $]) )
385                 {
386                     matches_buffer ~= match;
387                     //set pos to end of the match
388                     pos = match.ptr - subject.ptr + match.length;
389                 }
390                 else
391                 {
392                     break;
393                 }
394             }
395 
396             return matches_buffer;
397         }
398 
399         /***********************************************************************
400 
401             Study a compiled regex in order to increase processing efficiency
402             when calling match(). This is usually only worth doing for a regex
403             which will be used many times, and does not always yield an
404             improvement in efficiency.
405 
406             Throws:
407                 if an error occurs when studying the regex
408 
409             In:
410                 the regex must have been compiled
411 
412         ***********************************************************************/
413 
414         public void study ( )
415         {
416             verify(this.pcre_object !is null);
417 
418             char* errmsg;
419             auto res = pcre_study(this.pcre_object, 0, &errmsg);
420             if ( errmsg )
421             {
422                 auto derrmsg = StringC.toDString(errmsg);
423                 this.outer.exception.set(0, assumeUnique(derrmsg));
424                 throw this.outer.exception;
425             }
426             if ( res )
427             {
428                 this.match_settings.study_data = res.study_data;
429             }
430         }
431 
432         /***********************************************************************
433 
434             Cleans up the compiled regex object and the study data.
435 
436         ***********************************************************************/
437 
438         private void cleanup ( )
439         {
440             free(this.pcre_object);
441             this.match_settings = this.match_settings.init;
442         }
443     }
444 
445     /***************************************************************************
446 
447         Constructor. Initializes the re-usable exception.
448 
449     ***************************************************************************/
450 
451     public this ( )
452     {
453         this.exception = new PcreException();
454     }
455 
456     /***************************************************************************
457 
458         Perform a regular expression match. Note that this method internally
459         allocates and then frees a C pcre object each time it is called. If you
460         want to run the same regex search multiple times on different input, you
461         are probably better off using the compile() method, above.
462 
463         Usage:
464             auto regex = new PCRE;
465             bool match = regex.preg_match("Hello World!", "^Hello");
466 
467         Params:
468             subject = the compiled patter will be matched against this string
469             pattern = pattern to search for, as a string
470             case_sens = case sensitive matching
471 
472         Returns:
473             true, if matches or false if no match
474 
475         Throws:
476             if the compilation or running of the regex fails
477 
478     ***************************************************************************/
479 
480     public bool preg_match ( cstring subject, cstring pattern, bool case_sens = true )
481     {
482         scope regex = new CompiledRegex;
483         regex.compile(pattern, case_sens);
484         return regex.match(subject);
485     }
486 }
487 
488 version ( UnitTest )
489 {
490     import ocean.core.Test;
491 
492     class CounterNamedTest : NamedTest
493     {
494         static uint test_num;
495 
496         this ( )
497         {
498             super(format("PCRE test #{}", ++test_num));
499         }
500     }
501 }
502 
503 
504 /*******************************************************************************
505 
506     Test for invalid pattern
507 
508 *******************************************************************************/
509 
510 unittest
511 {
512     auto pcre = new PCRE;
513     testThrown!(PCRE.PcreException)(pcre.preg_match("", "("));
514 }
515 
516 /*******************************************************************************
517 
518     Tests for simple boolean matching via the preg_match() method. (This
519     unittest tests only the interface of this method. It does not test the full
520     range of PCRE features as that is beyond its scope.)
521 
522 *******************************************************************************/
523 
524 unittest
525 {
526     void test ( scope bool delegate ( ) dg, bool match )
527     {
528         auto t = new CounterNamedTest;
529         t.test!("==")(match, dg());
530     }
531 
532     auto pcre = new PCRE;
533 
534     // Empty pattern (matches any string)
535     test({ return pcre.preg_match("Hello World", ""); }, true);
536 
537     // Empty string and empty pattern (match)
538     test({ return pcre.preg_match("", ""); }, true);
539 
540     // Empty string (no match)
541     test({ return pcre.preg_match("", "a"); }, false);
542 
543     // Simple string match
544     test({ return pcre.preg_match("Hello World", "Hello"); }, true);
545 
546     // Simple string match (fail)
547     test({ return pcre.preg_match("Hello World", "Hallo"); }, false);
548 
549     // Case-sensitive match (fail)
550     test({ return pcre.preg_match("Hello World", "hello"); }, false);
551 
552     // Case-insensitive match
553     test({ return pcre.preg_match("Hello World", "hello", false); }, true);
554 }
555 
556 /*******************************************************************************
557 
558     Tests for single substring matching via the CompiledRegex.findFirst()
559     method.
560 
561 *******************************************************************************/
562 
563 unittest
564 {
565     auto pcre = new PCRE;
566     auto regex = pcre..new CompiledRegex;
567 
568     void testFind ( cstring needle, cstring pre, cstring match, cstring post )
569     {
570         regex.compile(needle);
571 
572         auto haystack = pre ~ match ~ post;
573         auto res = regex.findFirst(haystack);
574 
575         auto t = new CounterNamedTest;
576         t.test!("==")(res, haystack[pre.length .. pre.length + match.length]);
577     }
578 
579     // Simple match
580     testFind(
581         "a",
582         "bbb",
583         "a",
584         "ccc"
585     );
586 
587     // Match with a more complicated regex
588     testFind(
589        "(firstparam=[^l]*liza.*(secondparam=mary|secondparam=lena|secondparam=john))|((secondparam=mary|secondparam=lena|secondparam=john).*firstparam=liza)",
590         "http://example.org?",
591         "firstparam=elizabeth&rand=84527497861&secondparam=mary",
592         "&some=other&params=are&here=%20%21%22%12222"
593     );
594 
595     // Single match returned when multiple matches are possible
596     testFind(
597         "a",
598         "bbb",
599         "a",
600         "cacac"
601     );
602 }
603 
604 /*******************************************************************************
605 
606     Tests for multiple substring matching via the CompiledRegex.findAll()
607     method.
608 
609 *******************************************************************************/
610 
611 unittest
612 {
613     auto pcre = new PCRE;
614     auto regex = pcre..new CompiledRegex;
615     cstring[] matches_buffer;
616 
617     auto t = new NamedTest("PCRE findAll");
618 
619     {
620         istring str = "apa bepa cepa depa epa fepa gepa hepa";
621         regex.compile("a[ ]", false);
622 
623         matches_buffer.length = 0;
624         enableStomping(matches_buffer);
625 
626         foreach ( match; regex.findAll(str, matches_buffer) )
627         {
628             t.test!("==")(match, "a "[]);
629         }
630 
631         t.test!("==")(matches_buffer.length, 7);
632     }
633 
634     {
635         istring[3] exp = ["ast", "at", "ast"];
636         istring str = "en hast at en annan hast";
637         regex.compile("a[s]*t", false);
638 
639         matches_buffer = null;
640 
641 
642         foreach (i, match; regex.findAll(str, matches_buffer) )
643         {
644             t.test!("==")(match, exp[i]);
645         }
646         t.test!("==")(matches_buffer.length, 3);
647     }
648 
649     {
650         istring[3] exp = ["ta", "tb", "td"];
651         istring str = "tatb t c Tf td";
652         regex.compile("t[\\w]", true);
653         regex.study();
654 
655         matches_buffer = null;
656 
657         foreach (i, match; regex.findAll(str, matches_buffer) )
658         {
659             t.test!("==")(match, exp[i]);
660         }
661 
662         t.test!("==")(matches_buffer.length, 3);
663     }
664 
665     {
666         istring str = "en text";
667         regex.compile("zzz", false);
668         matches_buffer = null;
669         auto matches = regex.findAll(str, matches_buffer);
670 
671         t.test!("==")(matches_buffer.length, 0);
672     }
673 
674     {
675         istring str = "en text";
676         regex.compile("zzz", false);
677         matches_buffer = null;
678         auto matches = regex.findAll(str, matches_buffer);
679 
680         t.test!("==")(matches_buffer.length, 0);
681     }
682 
683     {
684         istring str ="aaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
685                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
686                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaa"
687                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaa"
688                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaa"
689                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaa"
690                  ~ "aaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbAbbbbbbbbbbbb"
691                  ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbAbbbbbbb"
692                  ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaAaaaaa"
693                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaa"
694                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaa"
695                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
696                  ~ "aaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
697                  ~ "aacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
698                  ~ "aaaacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
699 
700         regex.compile("a", true);
701         matches_buffer.length = 0;
702 
703         foreach (i, match; regex.findAll(str, matches_buffer) )
704         {
705             t.test!("==")(match, "a"[]);
706         }
707 
708         t.test!("==")(matches_buffer.length, 695);
709     }
710 }
711