1 /*******************************************************************************
2 
3     D Library Wrapper for PCRE regular expression engine.
4 
5     Requires linking with libpcre (-lpcre).
6 
7     Usage example:
8 
9     ---
10 
11         import ocean.text.regex.PCRE;
12 
13         auto pcre = new PCRE;
14 
15         // Simple, one-off use
16         auto match = pcre.preg_match("Hello World!", "^Hello");
17 
18         // Compile then reuse
19         auto regex = pcre.new CompiledRegex;
20         regex.compile("^Hello");
21         for ( int i; i < 100; i++ )
22         {
23             auto match = regex.match("Hello World!");
24         }
25 
26     ---
27 
28 
29     Related:
30         http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html
31 
32     Copyright:
33         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
34         All rights reserved.
35 
36     License:
37         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
38         Alternatively, this file may be distributed under the terms of the Tango
39         3-Clause BSD License (see LICENSE_BSD.txt for details).
40 
41         Bear in mind this module provides bindings to an external library that
42         has its own license, which might be more restrictive. Please check the
43         external library license to see which conditions apply for linking.
44 
45 *******************************************************************************/
46 
47 module ocean.text.regex.PCRE;
48 
49 
50 
51 
52 import ocean.meta.types.Qualifiers;
53 import ocean.core.TypeConvert: assumeUnique;
54 
55 import ocean.core.Array : copy, concat;
56 import ocean.text.util.StringC;
57 import ocean.text.regex.c.pcre;
58 import ocean.core.TypeConvert;
59 
60 import core.stdc.stdlib : free;
61 import ocean.text.convert.Formatter;
62 
63 import ocean.core.Verify;
64 
65 
66 /*******************************************************************************
67 
68     PCRE
69 
70 *******************************************************************************/
71 
72 public class PCRE
73 {
74     /***************************************************************************
75 
76         Represents a PCRE Exception.
77         The class is re-usable exception where the error message can be
78         reset and the same instance can be re-thrown.
79 
80     ***************************************************************************/
81 
82     public static class PcreException : Exception
83     {
84         /***********************************************************************
85 
86             Error code returned by pcre function
87 
88         ***********************************************************************/
89 
90         public int error;
91 
92         /***********************************************************************
93 
94             Constructor.
95             Just calls the super Exception constructor with initial error
96             message.
97 
98         ***********************************************************************/
99 
100         public this()
101         {
102             super("Error message not yet set");
103         }
104 
105         /***********************************************************************
106 
107             Sets the error code and message.
108 
109             Params:
110                 code = error code to set
111                 msg = the new exception message to be used
112 
113         ***********************************************************************/
114 
115         private void set ( int code, istring msg )
116         {
117             this.error = code;
118             this.msg = msg;
119         }
120     }
121 
122     /***************************************************************************
123 
124         Limits the complexity of regex searches. If a regex search passes the
125         specified complexity limit without either finding a match or determining
126         that no match exists, it bails out, throwing an exception (see
127         CompiledRegex.match()).
128 
129         The default value of 0 uses libpcre's built-in default complexity
130         limit (10 million, see below), which is set to be extremely permissive.
131         Any value less than 10 million will have the effect of reducing the
132         level of complexity tolerated, thus reducing the potential processing
133         time spent searching.
134 
135         This field maps directly to the match_limit field in libpcre's
136         pcre_extra struct.
137 
138         From http://regexkit.sourceforge.net/Documentation/pcre/pcreapi.html:
139 
140         The match_limit field provides a means of preventing PCRE from using up
141         a vast amount of resources when running patterns that are not going to
142         match, but which have a very large number of possibilities in their
143         search trees. The classic example is the use of nested unlimited
144         repeats.
145 
146         Internally, PCRE uses a function called match() which it calls
147         repeatedly (sometimes recursively). The limit set by match_limit is
148         imposed on the number of times this function is called during a match,
149         which has the effect of limiting the amount of backtracking that can
150         take place. For patterns that are not anchored, the count restarts from
151         zero for each position in the subject string.
152 
153         The default value for the limit can be set when PCRE is built; the
154         default default is 10 million, which handles all but the most extreme
155         cases.
156 
157     ***************************************************************************/
158 
159     public static immutable int DEFAULT_COMPLEXITY_LIMIT = 0;
160 
161     public int complexity_limit = DEFAULT_COMPLEXITY_LIMIT;
162 
163     /***************************************************************************
164 
165         String used internally for formatting.
166 
167     ***************************************************************************/
168 
169     private mstring buffer_char;
170 
171     /***************************************************************************
172 
173         A re-usable exception instance
174 
175     ***************************************************************************/
176 
177     private PcreException exception;
178 
179     /***************************************************************************
180 
181         Compiled regex class. Enables a regex pattern to be compiled once and
182         used for multiple searches.
183 
184     ***************************************************************************/
185 
186     public class CompiledRegex
187     {
188         /***********************************************************************
189 
190             Pointer to C-allocated pcre regex object, created upon compilation
191             of a regex (see compile()).
192 
193         ***********************************************************************/
194 
195         private pcre* pcre_object;
196 
197         /***********************************************************************
198 
199             Settings used by the call to pcre_exec() in the match() method.
200             These are modified by the complexity_limit field of the outer class,
201             and by the study() method.
202 
203         ***********************************************************************/
204 
205         private pcre_extra match_settings;
206 
207         /***********************************************************************
208 
209             Destructor. Frees the C-allocated pcre object.
210 
211         ***********************************************************************/
212 
213         ~this ( )
214         {
215             this.cleanup();
216         }
217 
218         /***********************************************************************
219 
220             Compiles the specified regex for use in the match() method. Cleans
221             up a previously compiled regex, if this instance has been used
222             before.
223 
224             Params:
225                 pattern = pattern to search for, as a string
226                 case_sens = case sensitive matching
227 
228             Throws:
229                 if the compilation of the regex fails
230 
231             Out:
232                 following a call to this method, the compiled regex exists
233 
234         ***********************************************************************/
235 
236         public void compile ( cstring pattern, bool case_sens = true )
237         out
238         {
239             assert(this.pcre_object !is null);
240         }
241         do
242         {
243 
244             this.cleanup();
245 
246             char* errmsg;
247             int error_code;
248             int error_offset;
249 
250             this.outer.buffer_char.concat(pattern, "\0"[]);
251             this.pcre_object = pcre_compile2(this.outer.buffer_char.ptr,
252                     (case_sens ? 0 : PCRE_CASELESS), &error_code, &errmsg,
253                     &error_offset, null);
254             if ( !this.pcre_object )
255             {
256                 this.outer.exception.msg = format(
257                     "Error compiling regular expression: {} - on pattern: {} at position {}",
258                     StringC.toDString(errmsg), pattern, error_offset
259                 );
260                 this.outer.exception.error = error_code;
261                 throw this.outer.exception;
262             }
263         }
264 
265         /***********************************************************************
266 
267             Perform a regular expression match.
268 
269             Params:
270                 subject = the compiled patter will be matched against this string
271 
272             Returns:
273                 true, if matches or false if no match
274 
275             Throws:
276                 if an error occurs when running the regex search
277 
278             In:
279                 the regex must have been compiled
280 
281         ***********************************************************************/
282 
283         public bool match ( cstring subject )
284         {
285             //"" matches against the pattern ""
286             if ( this.outer.buffer_char == "\0" && subject == "" )
287             {
288                 return true;
289             }
290 
291             return this.findFirst(subject) != null;
292         }
293 
294         /***********************************************************************
295 
296             Performs a regular expression match and return the first found match
297 
298             Params:
299                 subject  = input string
300 
301             Returns:
302                 slice to the first found match or null if no match
303 
304             Throws:
305                 if an error occurs when running the regex search
306 
307             In:
308                 the regex must have been compiled
309 
310         ***********************************************************************/
311 
312         public cstring findFirst ( cstring subject )
313         {
314             verify(this.pcre_object !is null);
315 
316             // This method supports only one capture so size 3 is enough.
317             // 2/3 for the positions and 1/3 for a internal buffer
318             int[3] ovector;
319 
320             if ( this.outer.complexity_limit != DEFAULT_COMPLEXITY_LIMIT )
321             {
322                 this.match_settings.flags |= PCRE_EXTRA_MATCH_LIMIT;
323                 this.match_settings.match_limit = this.outer.complexity_limit;
324             }
325 
326             int error_code = pcre_exec(this.pcre_object, &this.match_settings,
327                 subject.ptr, castFrom!(size_t).to!(int)(subject.length), 0, 0,
328                 ovector.ptr, ovector.length);
329 
330             // A positive return value indicates that a single match was found
331             // and its indices stored in ovector[0] and ovector[1].
332             // A zero return value indicates that multiple matches were found,
333             // the first stored in ovector[0] and ovector[1], and the rest
334             // discarded.
335             if ( error_code >= 0 )
336             {
337                 // we got a match but ovector is 0 so the whole subject matched
338                 if ( ovector[0] == 0 && ovector[1] == 0 )
339                 {
340                     return subject;
341                 }
342 
343                 return subject[ovector[0] .. ovector[1]];
344             }
345             // Negative return values indicate failure or error. We ignore
346             // match failures and throw on error.
347             else if ( error_code != PCRE_ERROR_NOMATCH )
348             {
349                 this.outer.exception.set(error_code,
350                     "Error on executing regular expression!");
351                 throw this.outer.exception;
352             }
353 
354             return null;
355         }
356 
357         /***********************************************************************
358 
359             Performs a regular expression match and returns an array of slices
360             of all matches
361 
362             Params:
363                 subject  = input string
364                 matches_buffer = found matches will be stored here
365 
366             Returns:
367                 Array with slices of all matches
368 
369             Throws:
370                 if an error occurs when running the regex search
371 
372             In:
373                 the regex must have been compiled
374 
375         ***********************************************************************/
376 
377         public cstring[] findAll ( cstring subject, ref cstring[] matches_buffer )
378         {
379             verify(this.pcre_object !is null);
380 
381             ptrdiff_t pos;
382 
383             while ( pos < subject.length )
384             {
385                 if ( auto match = this.findFirst(subject[pos .. $]) )
386                 {
387                     matches_buffer ~= match;
388                     //set pos to end of the match
389                     pos = match.ptr - subject.ptr + match.length;
390                 }
391                 else
392                 {
393                     break;
394                 }
395             }
396 
397             return matches_buffer;
398         }
399 
400         /***********************************************************************
401 
402             Study a compiled regex in order to increase processing efficiency
403             when calling match(). This is usually only worth doing for a regex
404             which will be used many times, and does not always yield an
405             improvement in efficiency.
406 
407             Throws:
408                 if an error occurs when studying the regex
409 
410             In:
411                 the regex must have been compiled
412 
413         ***********************************************************************/
414 
415         public void study ( )
416         {
417             verify(this.pcre_object !is null);
418 
419             char* errmsg;
420             auto res = pcre_study(this.pcre_object, 0, &errmsg);
421             if ( errmsg )
422             {
423                 auto derrmsg = StringC.toDString(errmsg);
424                 this.outer.exception.set(0, assumeUnique(derrmsg));
425                 throw this.outer.exception;
426             }
427             if ( res )
428             {
429                 this.match_settings.study_data = res.study_data;
430             }
431         }
432 
433         /***********************************************************************
434 
435             Cleans up the compiled regex object and the study data.
436 
437         ***********************************************************************/
438 
439         private void cleanup ( )
440         {
441             free(this.pcre_object);
442             this.match_settings = this.match_settings.init;
443         }
444     }
445 
446     /***************************************************************************
447 
448         Constructor. Initializes the re-usable exception.
449 
450     ***************************************************************************/
451 
452     public this ( )
453     {
454         this.exception = new PcreException();
455     }
456 
457     /***************************************************************************
458 
459         Perform a regular expression match. Note that this method internally
460         allocates and then frees a C pcre object each time it is called. If you
461         want to run the same regex search multiple times on different input, you
462         are probably better off using the compile() method, above.
463 
464         Usage:
465             auto regex = new PCRE;
466             bool match = regex.preg_match("Hello World!", "^Hello");
467 
468         Params:
469             subject = the compiled patter will be matched against this string
470             pattern = pattern to search for, as a string
471             case_sens = case sensitive matching
472 
473         Returns:
474             true, if matches or false if no match
475 
476         Throws:
477             if the compilation or running of the regex fails
478 
479     ***************************************************************************/
480 
481     public bool preg_match ( cstring subject, cstring pattern, bool case_sens = true )
482     {
483         scope regex = new CompiledRegex;
484         regex.compile(pattern, case_sens);
485         return regex.match(subject);
486     }
487 }
488 
489 version (unittest)
490 {
491     import ocean.core.Test;
492 
493     class CounterNamedTest : NamedTest
494     {
495         static uint test_num;
496 
497         this ( )
498         {
499             super(format("PCRE test #{}", ++test_num));
500         }
501     }
502 }
503 
504 
505 /*******************************************************************************
506 
507     Test for invalid pattern
508 
509 *******************************************************************************/
510 
511 unittest
512 {
513     auto pcre = new PCRE;
514     testThrown!(PCRE.PcreException)(pcre.preg_match("", "("));
515 }
516 
517 /*******************************************************************************
518 
519     Tests for simple boolean matching via the preg_match() method. (This
520     unittest tests only the interface of this method. It does not test the full
521     range of PCRE features as that is beyond its scope.)
522 
523 *******************************************************************************/
524 
525 unittest
526 {
527     void test ( scope bool delegate ( ) dg, bool match )
528     {
529         auto t = new CounterNamedTest;
530         t.test!("==")(match, dg());
531     }
532 
533     auto pcre = new PCRE;
534 
535     // Empty pattern (matches any string)
536     test({ return pcre.preg_match("Hello World", ""); }, true);
537 
538     // Empty string and empty pattern (match)
539     test({ return pcre.preg_match("", ""); }, true);
540 
541     // Empty string (no match)
542     test({ return pcre.preg_match("", "a"); }, false);
543 
544     // Simple string match
545     test({ return pcre.preg_match("Hello World", "Hello"); }, true);
546 
547     // Simple string match (fail)
548     test({ return pcre.preg_match("Hello World", "Hallo"); }, false);
549 
550     // Case-sensitive match (fail)
551     test({ return pcre.preg_match("Hello World", "hello"); }, false);
552 
553     // Case-insensitive match
554     test({ return pcre.preg_match("Hello World", "hello", false); }, true);
555 }
556 
557 /*******************************************************************************
558 
559     Tests for single substring matching via the CompiledRegex.findFirst()
560     method.
561 
562 *******************************************************************************/
563 
564 unittest
565 {
566     auto pcre = new PCRE;
567     auto regex = pcre..new CompiledRegex;
568 
569     void testFind ( cstring needle, cstring pre, cstring match, cstring post )
570     {
571         regex.compile(needle);
572 
573         auto haystack = pre ~ match ~ post;
574         auto res = regex.findFirst(haystack);
575 
576         auto t = new CounterNamedTest;
577         t.test!("==")(res, haystack[pre.length .. pre.length + match.length]);
578     }
579 
580     // Simple match
581     testFind(
582         "a",
583         "bbb",
584         "a",
585         "ccc"
586     );
587 
588     // Match with a more complicated regex
589     testFind(
590        "(firstparam=[^l]*liza.*(secondparam=mary|secondparam=lena|secondparam=john))|((secondparam=mary|secondparam=lena|secondparam=john).*firstparam=liza)",
591         "http://example.org?",
592         "firstparam=elizabeth&rand=84527497861&secondparam=mary",
593         "&some=other&params=are&here=%20%21%22%12222"
594     );
595 
596     // Single match returned when multiple matches are possible
597     testFind(
598         "a",
599         "bbb",
600         "a",
601         "cacac"
602     );
603 }
604 
605 /*******************************************************************************
606 
607     Tests for multiple substring matching via the CompiledRegex.findAll()
608     method.
609 
610 *******************************************************************************/
611 
612 unittest
613 {
614     auto pcre = new PCRE;
615     auto regex = pcre..new CompiledRegex;
616     cstring[] matches_buffer;
617 
618     auto t = new NamedTest("PCRE findAll");
619 
620     {
621         istring str = "apa bepa cepa depa epa fepa gepa hepa";
622         regex.compile("a[ ]", false);
623 
624         matches_buffer.length = 0;
625         assumeSafeAppend(matches_buffer);
626 
627         foreach ( match; regex.findAll(str, matches_buffer) )
628         {
629             t.test!("==")(match, "a "[]);
630         }
631 
632         t.test!("==")(matches_buffer.length, 7);
633     }
634 
635     {
636         istring[3] exp = ["ast", "at", "ast"];
637         istring str = "en hast at en annan hast";
638         regex.compile("a[s]*t", false);
639 
640         matches_buffer = null;
641 
642 
643         foreach (i, match; regex.findAll(str, matches_buffer) )
644         {
645             t.test!("==")(match, exp[i]);
646         }
647         t.test!("==")(matches_buffer.length, 3);
648     }
649 
650     {
651         istring[3] exp = ["ta", "tb", "td"];
652         istring str = "tatb t c Tf td";
653         regex.compile("t[\\w]", true);
654         regex.study();
655 
656         matches_buffer = null;
657 
658         foreach (i, match; regex.findAll(str, matches_buffer) )
659         {
660             t.test!("==")(match, exp[i]);
661         }
662 
663         t.test!("==")(matches_buffer.length, 3);
664     }
665 
666     {
667         istring str = "en text";
668         regex.compile("zzz", false);
669         matches_buffer = null;
670         auto matches = regex.findAll(str, matches_buffer);
671 
672         t.test!("==")(matches_buffer.length, 0);
673     }
674 
675     {
676         istring str = "en text";
677         regex.compile("zzz", false);
678         matches_buffer = null;
679         auto matches = regex.findAll(str, matches_buffer);
680 
681         t.test!("==")(matches_buffer.length, 0);
682     }
683 
684     {
685         istring str ="aaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
686                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
687                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaaa"
688                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaaaaaaa"
689                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaa"
690                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaaaaaaaaaaaaaaaa"
691                  ~ "aaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbAbbbbbbbbbbbb"
692                  ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbAbbbbbbb"
693                  ~ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaAaaaaa"
694                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaaaaa"
695                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAaa"
696                  ~ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
697                  ~ "aaaaaAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
698                  ~ "aacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
699                  ~ "aaaacaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
700 
701         regex.compile("a", true);
702         matches_buffer.length = 0;
703 
704         foreach (i, match; regex.findAll(str, matches_buffer) )
705         {
706             t.test!("==")(match, "a"[]);
707         }
708 
709         t.test!("==")(matches_buffer.length, 695);
710     }
711 }