// bdlpcre_regex.h -*-C++-*- #ifndef INCLUDED_BDLPCRE_REGEX #define INCLUDED_BDLPCRE_REGEX #include <bsls_ident.h> BSLS_IDENT("$Id$ $CSID$") //@PURPOSE: Provide a mechanism for regular expression pattern matching. // //@CLASSES: // bdlpcre::RegEx: mechanism for compiling and matching regular expressions // //@SEE_ALSO: http://www.pcre.org/ // //@DESCRIPTION: This component provides a mechanism, 'bdlpcre::RegEx', for // compiling (or "preparing") regular expressions, and subsequently matching // subject strings against a prepared expression and replacing the matching // parts with the replacement string. The regular expressions supported by // this component correspond approximately with Perl 5.10. See the appendix // entitled "Perl Compatibility" below for more information. // // Upon construction, a 'bdlpcre::RegEx' object is initially not associated // with a regular expression. A regular expression pattern is compiled for use // by the object using the 'prepare' method. Subject strings may then be // matched against the prepared pattern using the set of overloaded 'match' // methods. // // The component provides the following groups of 'match' overloads (and // similarly for 'matchRaw'): // //: 1 The first group of 'match' overloads simply returns 0 if a given subject //: string matches the prepared regular expression, and returns a non-zero //: value otherwise. //: //: 2 The second group of 'match' overloads returns the substring of the //: subject that was matched, either as a 'bsl::string_view', or as a //: 'bsl::pair<size_t, size_t>' holding the (offset, length) pair. //: //: 3 The third group of 'match' overloads returns a vector of either //: 'bsl::string_view' or 'bsl::pair<size_t, size_t>' holding the matched //: substrings. The first element of the vector indicate the substring of //: the subject that matched the entire pattern. Subsequent elements //: indicate the substrings of the subject that matched respective //: sub-patterns. 
// // The matched parts of subject strings can be replaced with the replacement // string using the set of overloaded 'replace' and 'replaceRaw' methods. // ///"Prepared" State ///---------------- // A 'bdlpcre::RegEx' object must first be prepared with a valid regular // expression before attempting to match subject strings or replace the matched // parts. We say that an instance of 'bdlpcre::RegEx' is in the "prepared" // state if the object holds a valid regular expression, in which case calls to // the overloaded 'match' or 'replace' methods of that instance are valid. // Otherwise, the object is in the "unprepared" state. Upon construction, a // 'bdlpcre::RegEx' object is in the "unprepared" state. A successful call to // the 'prepare' method puts the object into the "prepared" state. The 'clear' // method, as well as an unsuccessful call to 'prepare', puts the object into // the "unprepared" state. The 'isPrepared' accessor may be used to determine // whether an object is prepared. // ///Prepare-Time Flags ///------------------ // A set of flags may be optionally supplied to the 'prepare' method to affect // specific pattern matching behavior. The flags recognized by 'prepare' are // defined in an enumeration declared within the 'bdlpcre::RegEx'. The // following describes these flags and their effects. // ///Case-Insensitive Matching ///- - - - - - - - - - - - - // If 'RegEx::k_FLAG_CASELESS' is included in the flags supplied to 'prepare', // then letters in the regular expression pattern supplied to 'prepare' match // both lower- and upper-case letters in subject strings subsequently supplied // to 'match'. This is equivalent to Perl's '/i' option, and can be changed // within a pattern by a '(?i)' option setting. // ///Multi-Line Matching ///- - - - - - - - - - // By default, a subject string supplied to 'match' or 'replace' is treated as // consisting of a single line of characters (even if it actually contains '\n' // characters).
The start-of-line meta-character '^' matches only at the // beginning of the string, and the end-of-line meta-character '$' matches only // at the end of the string (or before a terminating '\n', if present). This // matches the behavior of Perl. // // If 'RegEx::k_FLAG_MULTILINE' is included in the flags supplied to 'prepare', // then start-of-line and end-of-line meta-characters match immediately // following or immediately before any '\n' characters in subject strings // supplied to 'match', respectively (as well as at the very start and end of // subject strings). This is equivalent to Perl's '/m' option, and can be // changed within a pattern by a '(?m)' option setting. If there are no // '\n' characters in the subject string, or if there are no occurrences of '^' // or '$' in the prepared pattern, then including 'k_FLAG_MULTILINE' has no // effect. // ///UTF-8 Support ///- - - - - - - // If 'RegEx::k_FLAG_UTF8' is included in the flags supplied to 'prepare', then // the regular expression pattern supplied to 'prepare', the subject strings // subsequently supplied to 'match', 'matchRaw', 'replace', and 'replaceRaw' as // well as the replacement string supplied to 'replace' and 'replaceRaw' are // interpreted as strings of UTF-8 characters instead of strings of ASCII // characters. 'match' and 'replace' return a non-zero value if 'pattern()' // was prepared with 'k_FLAG_UTF8', but the subject or the replacement is not // a valid UTF-8 string. The behavior of 'matchRaw' is undefined if // 'pattern()' was prepared with 'k_FLAG_UTF8', but the subject is not a valid // UTF-8 string. Note that JIT optimization (see below) is disabled for // 'match' if 'pattern()' was prepared with 'k_FLAG_UTF8'. // ///Dot Matches All ///- - - - - - - - // If 'RegEx::k_FLAG_DOTMATCHESALL' is included in the flags supplied to // 'prepare', then a dot metacharacter in the pattern matches a character of // any value, including one that indicates a newline.
However, it only ever // matches one character, even if newlines are encoded as '\r\n'. If // 'k_FLAG_DOTMATCHESALL' is not used to prepare a regular expression, a dot // metacharacter will *not* match a newline; hence, patterns expected to match // across lines will fail to do so. This flag is equivalent to Perl's '/s' // option, and can be changed within a pattern by a '(?s)' option setting. A // negative class such as '[^a]' always matches newline characters, independent // of the setting of this option. // /// Creating a New String with Replacement ///--------------------------------------- // A new string can be created by applying the regular expression pattern to // the subject string in which the matching parts are replaced with the // replacement string supplied to the 'replace' and 'replaceRaw' methods. // ///Group Insertion Forms ///- - - - - - - - - - - // By default, a dollar character ('$') is an escape character that can specify // the insertion of characters from capture groups and names from '(*MARK)' or // other control verbs in the pattern (see // https://perldoc.perl.org/perlre#Special-Backtracking-Control-Verbs for // details). The following forms are always recognized: //.. // $$ insert a dollar character // $<n> or ${<n>} insert the contents of group <n> // $*MARK or ${*MARK} insert a control verb name //.. // Either a group number or a group name can be given for '<n>'. Curly braces // are required only if the following character would be interpreted as part of // the number or name. The number may be zero to include the entire matched // string. For example, if the pattern 'a(b)c' is matched with '=abc=' and the // replacement string '+$1$0$1+', the result is '=+babcb+='. // ///Replacement Flags ///- - - - - - - - - // A set of flags may be optionally supplied to the 'replace' and 'replaceRaw' // methods to affect specific substitution behavior.
The flags recognized by // 'replace' and 'replaceRaw' are defined in an enumeration declared within the // 'bdlpcre::RegEx'. The flags are passed as a bitwise combination of OR bits // in the 'options' argument to 'replace' and 'replaceRaw' (e.g., // 'k_REPLACE_GLOBAL | k_REPLACE_LITERAL'). The flags reflect // 'PCRE2_SUBSTITUTE_*' flags and are propagated to the underlying PCRE2 library // substitute function. See // {https://www.pcre.org/current/doc/html/pcre2api.html#SEC36} for details. The // following describes these flags and their effects. // ///Global Replacement /// - - - - - - // The default action of 'replace' and 'replaceRaw' is to perform just one // replacement if the pattern matches. The 'RegEx::k_REPLACE_GLOBAL' flag // requests multiple replacements in the subject string. // ///The Replacement String is Literal ///- - - - - - - - - - - - - - - - - // If 'RegEx::k_REPLACE_LITERAL' is set, the replacement string is not // interpreted in any way. // ///Extended Replacement Processing ///- - - - - - - - - - - // If 'RegEx::k_REPLACE_EXTENDED' is set, extra processing is applied to the // replacement string. Without this option, only the dollar character ('$') is // special, and only the group insertion forms listed above (see // {Group Insertion Forms}) are valid. When this flag is set, two things // change: // //: o Firstly, backslash in a replacement string is interpreted as an escape //: character. The usual forms such as '\n' or '\x{ddd}' can be used to //: specify particular character codes, and backslash followed by any //: non-alphanumeric character quotes that character. Extended quoting can //: be coded using '\Q...\E', exactly as in the pattern string. //: //: o The second effect is to add more flexibility to capture group //: substitution. The syntax is similar to that used by Bash: //:.. //: ${<n>:-<string>} //: ${<n>:+<string1>:<string2>} //:.. //: As before, '<n>' may be a group number or a name.
The first form //: specifies a default value. If group '<n>' is set, its value is inserted; //: if not, '<string>' is expanded and the result inserted. The second form //: specifies strings that are expanded and inserted when group '<n>' is set //: or unset, respectively. The first form is just a convenient shorthand //: for '${<n>:+${<n>}:<string>}'. // ///Treat Unknown Group As Unset ///- - - - - - - - - - // The 'RegEx::k_REPLACE_UNKNOWN_UNSET' causes references to capture groups // that do not appear in the pattern to be treated as unset groups. // ///Insert An Empty String For Unset Group /// - - - - - - - - - - - - - // The 'RegEx::k_REPLACE_UNSET_EMPTY' causes unset capture groups (including // unknown groups when 'RegEx::k_REPLACE_UNKNOWN_UNSET' is set) to be treated // as empty strings when inserted as described in {Group Insertion Forms}. If // this option is not set, an attempt to insert an unset group causes 'replace' // and 'replaceRaw' to return an error. This option does not influence the // extended substitution syntax described in {Extended Replacement Processing}. // ///JIT Compiling Optimization ///-------------------------- // Just-in-time compiling is a heavyweight optimization that can greatly speed // up pattern matching on supported platforms. However, it comes at the cost // of extra processing before the match is performed, so it is of most benefit // when the same pattern is going to be matched many times. This does not // necessarily mean many calls of a matching function; if the pattern is not // anchored, matching attempts may take place many times at various positions // in the subject, even for a single call. Therefore, if the subject string is // very long, it may still pay to use JIT even for one-off matches. // // If 'RegEx::k_FLAG_JIT' is included in the flags supplied to 'prepare', then // all following matches performed by 'matchRaw' will be JIT optimized. 
// Matches performed by 'match' will also be JIT optimized provided that // 'RegEx::k_FLAG_UTF8' was not supplied to 'prepare' (since UTF-8 string // validity checking is not done during JIT compilation). To disable JIT // optimization for all matches, prepare the regular expression again omitting // the 'k_FLAG_JIT' flag. // // JIT is supported on the following platforms: //.. // ARM 32-bit (v5, v7, and Thumb2) // ARM 64-bit // Intel x86 32-bit and 64-bit // MIPS 32-bit and 64-bit // Power PC 32-bit and 64-bit // SPARC 32-bit //.. // // The tables below demonstrate the benefit of the 'match' method with JIT // optimizations, as well as the increased cost for 'prepare' when enabling JIT // optimizations: //.. // Legend // ------ // 'SIMPLE_PATTERN': // Pattern - X(abc)*Z // Subject - XXXabcabcZZZ // // 'EMAIL_PATTERN': // Pattern - [A-Za-z0-9._-]+@[[A-Za-z0-9.-]+ // Subject - john.dow@bloomberg.net // // 'IP_ADDRESS_PATTERN': // Pattern - (?:[0-9]{1,3}\.){3}[0-9]{1,3} // Subject - 255.255.255.255 // // Each pattern/subject returns 1 match. //.. // In this first table, for each pattern, 'prepare' was called once, and 'match' // was called 100000 times (measurements are in seconds): //.. // Table 1: Performance Improvement for 'match' using k_FLAG_JIT // +--------------------+---------------------+---------------------+ // | Pattern | 'match' without-JIT | 'match' using-JIT | // +====================+=====================+=====================+ // | SIMPLE_PATTERN | 0.0559 (~5.1x) | 0.0108 | // +--------------------+---------------------+---------------------+ // | EMAIL_PATTERN | 0.0222 (~2.6x) | 0.0086 | // +--------------------+---------------------+---------------------+ // | IP_ADDRESS_PATTERN | 0.0331 (~5.3x) | 0.0062 | // +--------------------+---------------------+---------------------+ //..
// In this second table, for each pattern, we measured 10000 iterations, where // 'prepare' was called once, and 'match' was called once (measurements are in // seconds): //.. // Table 2: Performance Cost for 'prepare' using k_FLAG_JIT // +--------------------+-----------------------+-----------------------+ // | Pattern | 'prepare' without-JIT | 'prepare' using-JIT | // +====================+=======================+=======================+ // | SIMPLE_PATTERN | 0.2514 | 2.1426 (~8.5x) | // +--------------------+-----------------------+-----------------------+ // | EMAIL_PATTERN | 0.3386 | 2.5758 (~7.6x) | // +--------------------+-----------------------+-----------------------+ // | IP_ADDRESS_PATTERN | 0.3016 | 2.4433 (~8.1x) | // +--------------------+-----------------------+-----------------------+ //.. // Note that the tests were run on Linux / Intel Xeon CPU (3.47GHz, 64-bit), // compiled with gcc-4.8.2 in optimized mode. // ///Thread Safety ///------------- // 'bdlpcre::RegEx' is *const* *thread-safe*, meaning that accessors may be // invoked concurrently from different threads, but it is not safe to access or // modify a 'bdlpcre::RegEx' in one thread while another thread modifies the // same object. Specifically, the 'match' method can be called from multiple // threads after the pattern has been prepared. // // Note that 'bdlpcre::RegEx' incurs some overhead in order to provide // thread-safe pattern matching functionality. To perform the pattern match, // the underlying PCRE2 library requires a set of buffers that cannot be shared // between threads. // // The table below demonstrates the difference of invoking the 'match' method // from main (thread that invokes 'prepare') and other threads: //..
// Table 3: Performance cost for 'match' in multi-threaded application // +--------------------+-----------------------+----------------------------+ // | Pattern | 'match' (main thread) | 'match' (other thread(s)) | // +====================+=======================+============================+ // | SIMPLE_PATTERN | 0.0549 (~1.4x) | 0.0759 | // +--------------------+-----------------------+----------------------------+ // | EMAIL_PATTERN | 0.0259 (~1.8x) | 0.0464 | // +--------------------+-----------------------+----------------------------+ // | IP_ADDRESS_PATTERN | 0.0377 (~1.5x) | 0.0560 | // +--------------------+-----------------------+----------------------------+ //.. // Note that JIT stack is functionally part of the match context. Using large // JIT stack can incur additional performance penalty in the multi-threaded // applications. // ///Note on Memory Allocation Exceptions ///------------------------------------ // PCRE2 library supports memory allocation/deallocation functions supplied by // the client. 'bdlpcre_regex' provides wrappers around 'bslma' allocators // that are called from the context of the PCRE2 library (C linkage). Any // exceptions thrown during memory allocation are caught by the wrapper // functions and are not propagated to the PCRE2 library. // ///Usage ///----- // The following snippets of code illustrate using this component to extract // the text of the "Subject:" field from an Internet e-mail message (RFC822). // The following 'parseSubject' function accepts an RFC822-compliant message of // a specified length and returns the text of the message's subject in the // 'result' "out" parameter: //.. // int parseSubject(bsl::string *result, // const char *message, // bsl::size_t messageLength) // // Parse the specified 'message' of the specified 'messageLength' for // // the "Subject:" field of 'message'. 
Return 0 on success and load the // // specified 'result' with the text of the subject of 'message'; return // // a non-zero value otherwise with no effect on 'result'. // { //.. // The following is the regular expression that will be used to find the // subject text of 'message'. The "?P<subjectText>" syntax, borrowed from // Python, allows us later to refer to a particular matched sub-pattern (i.e., // the text between the ':' and the '\r' in the "Subject:" field of the header) // by the name "subjectText": //.. // const char PATTERN[] = "^subject:(?P<subjectText>[^\r]*)"; //.. // First we compile the 'PATTERN', using the 'prepare' method, in order to // match subject strings against it. In the event that 'prepare' fails, the // first two arguments will be loaded with diagnostic information (an // informational string and an index into the pattern at which the error // occurred, respectively). Two flags, 'RegEx::k_FLAG_CASELESS' and // 'RegEx::k_FLAG_MULTILINE', are used in preparing the pattern since Internet // message headers contain case-insensitive content as well as '\n' characters. // The 'prepare' method returns 0 on success, and a non-zero value otherwise: //.. // RegEx regEx; // bsl::string errorMessage; // size_t errorOffset; // // int returnValue = regEx.prepare(&errorMessage, // &errorOffset, // PATTERN, // RegEx::k_FLAG_CASELESS | // RegEx::k_FLAG_MULTILINE); // assert(0 == returnValue); //.. // Next we call 'match' supplying 'message' and its length. The 'matchVector' // will be populated with (offset, length) pairs describing substrings in // 'message' that match the prepared 'PATTERN'. All variants of the overloaded // 'match' method return 0 if a match is found, and return a non-zero value // otherwise: //.. // bsl::vector<bsl::pair<size_t, size_t> > matchVector; // returnValue = regEx.match(&matchVector, message, messageLength); // // if (0 != returnValue) { // return returnValue; // no match // } //.. 
// Then we pass "subjectText" to the 'subpatternIndex' method to obtain the // index into 'matchVector' that describes how to locate the subject text // within 'message'. The text is then extracted from 'message' and assigned to // the 'result' "out" parameter: //.. // const bsl::pair<size_t, size_t> capturedSubject = // matchVector[regEx.subpatternIndex("subjectText")]; // // *result = bsl::string(&message[capturedSubject.first], // capturedSubject.second); // // return 0; // } //.. // The following array contains the sample Internet e-mail message from which // we will extract the subject: //.. // const char RFC822_MESSAGE[] = // "Received: ; Fri, 23 Apr 2004 14:30:00 -0400\r\n" // "Message-ID: <12345@mailgate.bloomberg.net>\r\n" // "Date: Fri, 23 Apr 2004 14:30:00 -0400\r\n" // "From: <someone@bloomberg.net>\r\n" // "To: <someone_else@bloomberg.net>\r\n" // "Subject: This is the subject text\r\n" // "MIME-Version: 1.0\r\n" // "Content-Type: text/plain\r\n" // "\r\n" // "This is the message body.\r\n" // ".\r\n"; //.. // Finally, we call 'parseSubject' to extract the subject from // 'RFC822_MESSAGE'. The assertions verify that the subject of the message is // correctly extracted and assigned to the local 'subject' variable: //.. // int main() // { // bsl::string subject; // const int returnValue = parseSubject(&subject, // RFC822_MESSAGE, // sizeof(RFC822_MESSAGE) - 1); // assert(0 == returnValue); // assert(" This is the subject text" == subject); // } //.. // ///Appendix: Perl Compatibility /// - - - - - - - - - - - - - - // This section describes the differences in the ways that PCRE2 and Perl // handle regular expressions. The differences described here are with respect // to Perl versions 5.10 and above. // // 1) PCRE2 has only a subset of Perl's Unicode support. // // 2) PCRE2 allows repeat quantifiers only on parenthesized assertions, but // they do not mean what you might think. 
For example, '(?!a){3}' does not // assert that the next three characters are not '"a"'. It just asserts that // the next character is not '"a"' three times (in principle: PCRE2 optimizes // this to run the assertion just once). Perl allows repeat quantifiers on // other assertions such as '\b', but these do not seem to have any use. // // 3) Capturing subpatterns that occur inside negative lookahead assertions are // counted, but their entries in the offsets vector are never set. Perl // sometimes (but not always) sets its numerical variables from inside negative // assertions. // // 4) The following Perl escape sequences are not supported: '\l', '\u', '\L', // '\U', and '\N' when followed by a character name or Unicode value. ('\N' on // its own, matching a non-newline character, is supported.) In fact these are // implemented by Perl's general string-handling and are not part of its // pattern matching engine. If any of these are encountered by PCRE2, an error // is generated by default. // // 5) The Perl escape sequences '\p,' '\P,' and '\X' are supported only if // PCRE2 is built with Unicode support. The properties that can be tested with // '\p' and '\P' are limited to the general category properties such as 'Lu' // and 'Nd', script names such as Greek or Han, and the derived properties // 'Any' and 'L&'. PCRE2 does support the 'Cs' (surrogate) property, which // Perl does not; the Perl documentation says "Because Perl hides the need for // the user to understand the internal representation of Unicode characters, // there is no need to implement the somewhat messy concept of surrogates." // // 6) PCRE2 does support the '\Q...\E' escape for quoting substrings. // Characters in between are treated as literals. This is slightly different // from Perl in that '$' and '@' are also handled as literals inside the // quotes. In Perl, they cause variable interpolation (but of course PCRE2 // does not have variables). Note the following examples: //.. 
// Pattern PCRE2 matches Perl matches // ---------------- ------------- ------------------------------------ // \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz // \Qabc\$xyz\E abc\$xyz abc\$xyz // \Qabc\E\$\Qxyz\E abc$xyz abc$xyz //.. // The '\Q...\E' sequence is recognized both inside and outside character // classes. // // 7) PCRE2 does not support the '(?{code})' and '(??{code})' constructions. // However, there is support for recursive patterns. This is not available in // Perl 5.8, but it is in Perl 5.10. // // 8) Subroutine calls (whether recursive or not) are treated as atomic groups. // Atomic recursion is like Python, but unlike Perl. Captured values that are // set outside a subroutine call can be referenced from inside in PCRE2, but // not in Perl. // // 9) If any of the backtracking control verbs are used in a subpattern that is // called as a subroutine (whether or not recursively), their effect is // confined to that subpattern; it does not extend to the surrounding pattern. // This is not always the case in Perl. In particular, if '(*THEN)' is present // in a group that is called as a subroutine, its action is limited to that // group, even if the group does not contain any '|' characters. Note that // such subpatterns are processed as anchored at the point where they are // tested. // // 10) If a pattern contains more than one backtracking control verb, the first // one that is backtracked onto acts. For example, in the pattern // 'A(*COMMIT)B(*PRUNE)C' a failure in 'B' triggers '(*COMMIT),' but a failure // in 'C' triggers '(*PRUNE)'. Perl's behaviour is more complex; in many cases // it is the same as PCRE2, but there are examples where it differs. // // 11) Most backtracking verbs in assertions have their normal actions. They // are not confined to the assertion. // // 12) There are some differences that are concerned with the settings of // captured strings when part of a pattern is repeated. 
For example, matching // '"aba"' against the pattern '/^(a(b)?)+$/' in Perl leaves '$2' unset, but in // PCRE2 it is set to '"b"'. // // 13) PCRE2's handling of duplicate subpattern numbers and duplicate // subpattern names is not as general as Perl's. This is a consequence of the // fact that PCRE2 works internally just with numbers, using an external table // to translate between numbers and names. In particular, a pattern such as // '(?|(?<a>A)|(?<b>B))', where the two capturing parentheses have the same // number but different names, is not supported, and causes an error at compile // time. If it were allowed, it would not be possible to distinguish which // parentheses matched, because both names map to capturing subpattern number // 1. To avoid this confusing situation, an error is given at compile time. // // 14) Perl recognizes comments in some places that PCRE2 does not, for // example, between the '(' and '?' at the start of a subpattern. If the '/x' // modifier is set, Perl allows white space between '(' and '?' (though current // Perls warn that this is deprecated) but PCRE2 never does, even if the // 'PCRE2_EXTENDED' option is set. // // 15) Perl, when in warning mode, gives warnings for character classes such as // '[A-\d]' or '[a-[:digit:]]'. It then treats the hyphens as literals. PCRE2 // has no warning features, so it gives an error in these cases because they // are almost certainly user mistakes. // // 16) In PCRE2, the upper/lower case character properties 'Lu' and 'Ll' are // not affected when case-independent matching is specified. For example, // '\p{Lu}' always matches an upper case letter. // // 17) PCRE2 provides some extensions to the Perl regular expression // facilities. This list is with respect to Perl 5.10: // // (a) Although lookbehind assertions in PCRE2 must match fixed length strings, // each alternative branch of a lookbehind assertion can match a different // length of string. Perl requires them all to have the same length.
// // (b) If 'PCRE2_DOLLAR_ENDONLY' is set and 'PCRE2_MULTILINE' is not set, the // '$' meta-character matches only at the very end of the string. // // (c) A backslash followed by a letter with no special meaning is faulted. // (Perl can be made to issue a warning.) // // (d) If 'PCRE2_UNGREEDY' is set, the greediness of the repetition quantifiers // is inverted, that is, by default they are not greedy, but if followed by a // question mark they are. // // (e) 'PCRE2_ANCHORED' can be used at matching time to force a pattern to be // tried only at the first matching position in the subject string. // // (f) The 'PCRE2_NOTBOL', 'PCRE2_NOTEOL', 'PCRE2_NOTEMPTY', // 'PCRE2_NOTEMPTY_ATSTART', and 'PCRE2_NO_AUTO_CAPTURE' options have no Perl // equivalents. // // (g) The '\R' escape sequence can be restricted to match only 'CR', 'LF', or // 'CRLF' by the 'PCRE2_BSR_ANYCRLF' option. // // (h) The callout facility is PCRE2-specific. // // (i) The partial matching facility is PCRE2-specific. // // (j) The alternative matching function ('pcre2_dfa_match()') matches in a // different way and is not Perl-compatible. // // (k) PCRE2 recognizes some special sequences such as '(*CR)' at the start of // a pattern that set overall options that cannot be changed within the // pattern. // ///Additional Copyright Notice ///- - - - - - - - - - - - - - //.. // Copyright (c) 1997-2015 University of Cambridge // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution.
// // * Neither the name of the University of Cambridge nor the names of any // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 1997-2015 University of Cambridge //.. #include <bdlscm_version.h> #include <bslma_allocator.h> #include <bslma_managedptr.h> #include <bslma_usesbslmaallocator.h> #include <bslmf_enableif.h> #include <bslmf_issame.h> #include <bslmf_nestedtraitdeclaration.h> #include <bsls_atomicoperations.h> #include <bsls_libraryfeatures.h> #include <bsl_cstddef.h> #include <bsl_string.h> #include <bsl_string_view.h> #include <bsl_utility.h> // 'bsl::pair' #include <bsl_vector.h> #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR #include <memory_resource> #endif #include <string> #include <vector> #ifndef _PCRE2_H #define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_STATIC #include <pcre2/pcre2.h> #endif #ifndef BDE_DONT_ALLOW_TRANSITIVE_INCLUDES #include <bsls_types.h> #endif namespace BloombergLP { namespace bdlpcre { class RegEx_MatchContext; // =========== // class RegEx // =========== class RegEx { // This class provides a mechanism for compiling and matching regular // expressions. 
A regular expression approximately compatible with Perl // 5.10 is compiled with the 'prepare' method. Subsequently, strings are // matched against the compiled (prepared) pattern using the overloaded // 'match' and 'matchRaw' methods. Note that the underlying implementation // uses the open-source Perl Compatible Regular Expressions (PCRE2) library // that was developed at the University of Cambridge // ('http://www.pcre.org/'). // CLASS DATA static bsls::AtomicOperations::AtomicTypes::Int s_depthLimit; // process-wide // default maximum // evaluation // recursion depth // PRIVATE DATA int d_flags; // prepare/match flags bsl::string d_pattern; // regular expression pattern pcre2_general_context *d_pcre2Context_p; // PCRE2 general context pcre2_compile_context *d_compileContext_p; // PCRE2 compile context pcre2_code *d_patternCode_p; // PCRE2 compiled pattern int d_depthLimit; // evaluation recursion depth size_t d_jitStackSize; // PCRE JIT stack size bslma::ManagedPtr<RegEx_MatchContext> d_matchContext; // match context helper bslma::Allocator *d_allocator_p; // allocator to supply memory private: // NOT IMPLEMENTED RegEx(const RegEx&); RegEx& operator=(const RegEx&); // PRIVATE MANIPULATORS int prepareImp(char *errorBuffer, size_t errorBufferLength, size_t *errorOffset, const char *pattern, int flags, size_t jitStackSize); // Prepare this regular-expression object with the specified 'pattern', // 'flags', and 'jitStackSize' that indicates the size of the allocated // JIT stack to be used for 'pattern'. On success, put this object // into the "prepared" state and return 0, with no effect on the // specified 'errorBuffer' and 'errorOffset'. 
Otherwise, (1) put this // object into the "unprepared" state, (2) load 'errorBuffer' with a // message describing the error detected truncated to the specified // 'errorBufferLength' (including a null terminator), (3) load // 'errorOffset' with the offset in 'pattern' at which the error was // detected, and (4) return a non-zero value. The behavior is // undefined unless 'flags' is the bit-wise inclusive-or of 0 or more // of the following values: //.. // k_FLAG_CASELESS // k_FLAG_DOTMATCHESALL // k_FLAG_MULTILINE // k_FLAG_UTF8 // k_FLAG_JIT //.. // Note that the flag 'k_FLAG_JIT' is ignored if 'isJitAvailable()' is // 'false'. // PRIVATE ACCESSORS template <class RESULT_EXTRACTOR> int matchImp(const RESULT_EXTRACTOR& extractor, const char *subject, size_t subjectLength, size_t subjectOffset, bool skipUTF8Validation) const; // Match the specified 'subject', having the specified 'subjectLength', // against the pattern held by this regular-expression object // ('pattern()'). 'subject' need not be null-terminated and may // contain embedded null characters. The specified // 'skipUTF8Validation' flag indicates whether UTF-8 string validity // checking is skipped. Begin matching at the specified // 'subjectOffset' in 'subject'. Return: // //: o 0 on success and invoke the specified 'extractor' to extract the //: result of the match //: //: o 1 if the 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, otherwise // // The behavior is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', 'subjectOffset <= subjectLength', // and 'subject' is valid UTF-8 if 'pattern()' was prepared with // 'k_FLAG_UTF8' but 'false == skipUTF8Validation'. 
template <class STRING>
int replaceImp(STRING *result,
               int *errorOffset,
               const bsl::string_view& subject,
               const bsl::string_view& replacement,
               size_t options,
               bool skipUTF8Validation) const;
    // Replace parts of the specified 'subject' that are matched with the
    // specified 'replacement'.  The specified bit mask of 'options' flags
    // is used to configure the behavior of the replacement.  'options'
    // should contain a bit-wise OR of the 'k_REPLACE_*' constants defined
    // by this class, which indicate additional configuration parameters
    // for the replacement.  If 'options' has 'k_REPLACE_GLOBAL' flag then
    // this function iterates over 'subject', replacing every matching
    // substring.  If 'k_REPLACE_GLOBAL' flag is not set, only the first
    // matching substring is replaced.  The specified 'skipUTF8Validation'
    // flag indicates whether UTF-8 'replacement' validity checking is
    // skipped.  Return the number of substitutions that were carried out
    // on success, and load the specified 'result' with the result of the
    // replacement.  Otherwise, if an error occurs, return a negative
    // value.  If that error is a syntax error in 'replacement', load the
    // specified 'errorOffset' (if non-null) with the offset in
    // 'replacement' where the error was detected; for other errors, such
    // as invalid 'subject' or 'replacement' UTF-8 string, load
    // 'errorOffset' with a negative value.  The behavior is undefined
    // unless 'true == isPrepared()'.  Note that if the size of 'result' is
    // too small to fit the resultant string then this method computes the
    // size of 'result' and adjusts it to the size that is needed.  To
    // avoid automatic calculation and adjustment which may introduce a
    // performance penalty, it is recommended that the size of 'result' has
    // enough room to fit the resulting string including a zero-terminating
    // character.
public: // TRAITS BSLMF_NESTED_TRAIT_DECLARATION(RegEx, bslma::UsesBslmaAllocator); // CONSTANTS enum { // This enumeration defines the flags that may be supplied to 'prepare' // to affect specific pattern matching behavior. k_FLAG_CASELESS = 1 << 0, // case-insensitive matching k_FLAG_DOTMATCHESALL = 1 << 1, // dot metacharacter matches all chars // (including newlines) k_FLAG_MULTILINE = 1 << 2, // multi-line matching k_FLAG_UTF8 = 1 << 3, // UTF-8 support k_FLAG_JIT = 1 << 4 // just-in-time compiling optimization // requested }; enum { // This enumeration defines the flags that may be supplied to 'replace' // to affect specific replacement behavior. k_REPLACE_LITERAL = 1 << 0, // the replacement string is literal k_REPLACE_GLOBAL = 1 << 1, // replace all occurrences in the // subject k_REPLACE_EXTENDED = 1 << 2, // do extended replacement // processing k_REPLACE_UNKNOWN_UNSET = 1 << 3, // treat unknown group as unset k_REPLACE_UNSET_EMPTY = 1 << 4 // simple unset insert = empty // string }; static const size_t k_INVALID_OFFSET; // Value used to denote an invalid offset for match methods returning // pairs. // CLASS METHODS static int defaultDepthLimit(); // Return the process-wide default evaluation recursion depth limit. static bool isJitAvailable(); // Return 'true' if just-in-time compiling optimization is supported by // current hardware platform and 'false' otherwise. Note that JIT // support is limited to the following hardware platforms: //.. // ARM 32-bit (v5, v7, and Thumb2) // ARM 64-bit // Intel x86 32-bit and 64-bit // MIPS 32-bit and 64-bit // Power PC 32-bit and 64-bit // SPARC 32-bit //.. static int setDefaultDepthLimit(int depthLimit); // Set the process-wide default evaluation recursion depth limit to the // specified 'depthLimit'. Return the previous depth limit. // CREATORS RegEx(bslma::Allocator *basicAllocator = 0); // IMPLICIT // Create a regular-expression object in the "unprepared" state. 
// Optionally specify a 'basicAllocator' used to supply memory. The // alignment strategy of the allocator must be "maximum" or "natural". // If 'basicAllocator' is 0, the currently installed default allocator // is used. ~RegEx(); // Destroy this regular-expression object. // MANIPULATORS void clear(); // Free resources used by this regular-expression object and put this // object into the "unprepared" state. This method has no effect if // this object is already in the "unprepared" state. int prepare(bsl::nullptr_t errorMessage, size_t *errorOffset, const char *pattern, int flags = 0, size_t jitStackSize = 0); template <class STRING> typename bsl::enable_if< bsl::is_same<STRING, bsl::string>::value || bsl::is_same<STRING, std::string>::value #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR || bsl::is_same<STRING, std::pmr::string>::value #endif , int>::type prepare(STRING *errorMessage, size_t *errorOffset, const char *pattern, int flags = 0, size_t jitStackSize = 0); // Prepare this regular-expression object with the specified 'pattern' // and the optionally specified 'flags'. 'flags', if supplied, should // contain a bit-wise or of the 'k_FLAG_*' constants defined by this // class, which indicate additional configuration parameters for the // regular expression. Optionally specify 'jitStackSize'. If 'flags' // has the 'k_FLAG_JIT' flag set, 'jitStackSize' indicates the size of // the allocated JIT stack to be used for this pattern. If 'flags' // has the 'k_FLAG_JIT' bit set and 'jitStackSize' is 0 (or not // supplied), no memory will be allocated for the JIT stack and the // program stack will be used as the JIT stack. If 'flags' does not // have 'k_FLAG_JIT' set, or 'isJitAvailable()' is 'false', the // 'jitStackSize' parameter, if supplied, is ignored. On success, put // this object into the "prepared" state and return 0, with no effect // on the specified 'errorMessage' and 'errorOffset'. 
Otherwise, (1) // put this object into the "unprepared" state, (2) load 'errorMessage' // (if non-null) with a string describing the error detected, (3) load // 'errorOffset' (if non-null) with the offset in 'pattern' at which // the error was detected, and (4) return a non-zero value. The // behavior is undefined unless 'flags' is the bit-wise inclusive-or of // 0 or more of the following values: //.. // k_FLAG_CASELESS // k_FLAG_DOTMATCHESALL // k_FLAG_MULTILINE // k_FLAG_UTF8 // k_FLAG_JIT //.. // Note that the flag 'k_FLAG_JIT' is ignored if 'isJitAvailable()' is // 'false'. int setDepthLimit(int depthLimit); // Set the evaluation recursion depth limit for this regular-expression // object to the specified 'depthLimit'. Return the previous depth // limit. // ACCESSORS int depthLimit() const; // Return the evaluation recursion depth limit for this // regular-expression object. int flags() const; // Return the flags that were supplied to the most recent successful // call to the 'prepare' method of this regular-expression object. The // behavior is undefined unless 'isPrepared() == true'. Note that the // returned value will be the bit-wise inclusive-or of 0 or more of the // following values: //.. // k_FLAG_CASELESS // k_FLAG_DOTMATCHESALL // k_FLAG_MULTILINE // k_FLAG_UTF8 // k_FLAG_JIT //.. // Also note that 'k_FLAG_JIT' is ignored, but still returned by this // method, if 'isJitAvailable()' is 'false'. bool isPrepared() const; // Return 'true' if this regular-expression object is in the "prepared" // state, and 'false' otherwise. size_t jitStackSize() const; // Return the size of the dynamically allocated JIT stack if it has // been specified explicitly with the 'prepare' method. Return 0 if a // zero 'jitStackSize' value was passed to the 'prepare' method (or not // supplied at all) or if 'isPrepared()' is 'false'. int match(const bsl::string_view& subject, size_t subjectOffset = 0) const; // Match the specified 'subject' against 'pattern()'. 
Begin matching // at the optionally specified 'subjectOffset' in 'subject'. If // 'subjectOffset' is not specified, matching begins at the start of // 'subject'. UTF-8 validity checking is performed on 'subject' if // 'pattern()' was prepared with 'k_FLAG_UTF8'. Return: // //: o 0 on success //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, for example, if 'pattern()' was prepared //: with 'k_FLAG_UTF8', but 'subject' is not valid UTF-8 // // The behavior is undefined unless 'true == isPrepared()' and // 'subjectOffset <= subject.length()'. Note that JIT optimization is // disabled if 'pattern()' was prepared with 'k_FLAG_UTF8'; use // 'matchRaw' if JIT is preferred and UTF-8 validation of 'subject' is // not required. int match(const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; // Match the specified 'subject' having the specified 'subjectLength' // against 'pattern()'. Begin matching at the optionally specified // 'subjectOffset' in 'subject'. If 'subjectOffset' is not specified, // matching begins at the start of 'subject'. 'subject' may contain // embedded null characters. UTF-8 validity checking is performed on // 'subject' if 'pattern()' was prepared with 'k_FLAG_UTF8'. Return: // //: o 0 on success //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, for example, if 'pattern()' was prepared //: with 'k_FLAG_UTF8', but 'subject' is not valid UTF-8 // // The behavior is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', and // 'subjectOffset <= subjectLength'. 
Note that JIT optimization is // disabled if 'pattern()' was prepared with 'k_FLAG_UTF8'; use // 'matchRaw' if JIT is preferred and UTF-8 validation of 'subject' is // not required. int match(bsl::pair<size_t, size_t> *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; int match(bsl::string_view *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; // Match the specified 'subject' having the specified 'subjectLength' // against 'pattern()'. Begin matching at the optionally specified // 'subjectOffset' in 'subject'. If 'subjectOffset' is not specified, // matching begins at the start of 'subject'. 'subject' may contain // embedded null characters. UTF-8 validity checking is performed on // 'subject' if 'pattern()' was prepared with 'k_FLAG_UTF8'. Return: // //: o 0 on success and load the specified 'result' with, respectively, //: a '(offset, length)' pair or a 'bsl::string_view' indicating the //: leftmost match of 'pattern()' //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, for example, if 'pattern()' was prepared //: with 'k_FLAG_UTF8', but 'subject' is not valid UTF-8 // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', and // 'subjectOffset <= subjectLength'. Note that JIT optimization is // disabled if 'pattern()' was prepared with 'k_FLAG_UTF8'; use // 'matchRaw' if JIT is preferred and UTF-8 validation of 'subject' is // not required. int match(bsl::string_view *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; // Match the specified 'subject' against 'pattern()'. Begin matching // at the optionally specified 'subjectOffset' in 'subject'. 
If // 'subjectOffset' is not specified, matching begins at the start of // 'subject'. UTF-8 validity checking is performed on 'subject' if // 'pattern()' was prepared with 'k_FLAG_UTF8'. Return: // //: o 0 on success and load the specified 'result' with a //: 'bsl::string_view' indicating the leftmost match of 'pattern()' //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, for example, if 'pattern()' was prepared //: with 'k_FLAG_UTF8', but 'subject' is not valid UTF-8 // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()' and // 'subjectOffset <= subject.length()'. Note that JIT optimization is // disabled if 'pattern()' was prepared with 'k_FLAG_UTF8'; use // 'matchRaw' if JIT is preferred and UTF-8 validation of 'subject' is // not required. int match(bsl::vector<bsl::pair<size_t, size_t> > *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; int match(bsl::vector<bslstl::StringRef> *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; // Match the specified 'subject' having the specified 'subjectLength' // against 'pattern()'. Begin matching at the optionally specified // 'subjectOffset' in 'subject'. If 'subjectOffset' is not specified, // matching begins at the start of 'subject'. 'subject' may contain // embedded null characters. UTF-8 validity checking is performed on // 'subject' if 'pattern()' was prepared with 'k_FLAG_UTF8'. On // success: // //: 1 Load the first element of the specified 'result' with, //: respectively, a '(offset, length)' pair or a 'bslstl::StringRef' //: indicating the leftmost match of 'pattern()'. //: //: 2 Load elements of 'result' in the range '[1 .. 
numSubpatterns()]' //: with, respectively, a '(offset, length)' pair or a //: 'bslstl::StringRef' indicating the respective matches of //: sub-patterns (unmatched sub-patterns have their respective //: 'result' elements loaded with either the '(k_INVALID_OFFSET, 0)' //: pair or an empty 'bslstl::StringRef'); sub-patterns matching //: multiple times have their respective 'result' elements loaded //: with the pairs or 'bslstl::StringRef' indicating the rightmost //: match, and return 0. // // Otherwise, return: // //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, for example, if 'pattern()' was prepared //: with 'k_FLAG_UTF8', but 'subject' is not valid UTF-8 // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', and // 'subjectOffset <= subjectLength'. Note that JIT optimization is // disabled if 'pattern()' was prepared with 'k_FLAG_UTF8'; use // 'matchRaw' if JIT is preferred and UTF-8 validation of 'subject' is // not required. Also note that after a successful call, 'result' will // contain exactly 'numSubpatterns() + 1' elements. int match(bsl::vector<bsl::string_view> *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; int match(std::vector<bsl::string_view> *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR int match(std::pmr::vector<bsl::string_view> *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; #endif // Match the specified 'subject' against 'pattern()'. Begin matching // at the optionally specified 'subjectOffset' in 'subject'. If // 'subjectOffset' is not specified, matching begins at the start of // 'subject'. 
UTF-8 validity checking is performed on 'subject' if // 'pattern()' was prepared with 'k_FLAG_UTF8'. On success: // //: 1 Load the first element of the specified 'result' with a //: 'bsl::string_view' indicating the leftmost match of 'pattern()'. //: //: 2 Load elements of 'result' in the range '[1 .. numSubpatterns()]' //: with a 'bsl::string_view' indicating the respective matches of //: sub-patterns (unmatched sub-patterns have their respective //: 'result' elements loaded with an empty 'bsl::string_view'); //: sub-patterns matching multiple times have their respective //: 'result' elements loaded with a 'bsl::string_view' indicating the //: rightmost match, and return 0. // // Otherwise, return: // //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, for example, if 'pattern()' was prepared //: with 'k_FLAG_UTF8', but 'subject' is not valid UTF-8 // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()' and // 'subjectOffset <= subject.length()'. Note that JIT optimization is // disabled if 'pattern()' was prepared with 'k_FLAG_UTF8'; use // 'matchRaw' if JIT is preferred and UTF-8 validation of 'subject' is // not required. Also note that after a successful call, 'result' // will contain exactly 'numSubpatterns() + 1' elements. int matchRaw(const bsl::string_view& subject, size_t subjectOffset = 0) const; // Match the specified 'subject' against 'pattern()'. Begin matching // at the optionally specified 'subjectOffset' in 'subject'. If // 'subjectOffset' is not specified, matching begins at the start of // 'subject'. 
Return: // //: o 0 on success //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, otherwise // // The behavior is undefined unless 'true == isPrepared()', // 'subjectOffset <= subject.length()', and 'subject' is valid UTF-8 if // 'pattern()' was prepared with 'k_FLAG_UTF8'. int matchRaw(const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; // Match the specified 'subject' having the specified 'subjectLength' // against 'pattern()'. Begin matching at the optionally specified // 'subjectOffset' in 'subject'. If 'subjectOffset' is not specified, // matching begins at the start of 'subject'. 'subject' may contain // embedded null characters. Return: // //: o 0 on success //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, otherwise // // The behavior is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', 'subjectOffset <= subjectLength', // and 'subject' is valid UTF-8 if 'pattern()' was prepared with // 'k_FLAG_UTF8'. int matchRaw(bsl::pair<size_t, size_t> *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; int matchRaw(bsl::string_view *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; // Match the specified 'subject' having the specified 'subjectLength' // against 'pattern()'. Begin matching at the optionally specified // 'subjectOffset' in 'subject'. If 'subjectOffset' is not specified, // matching begins at the start of 'subject'. 'subject' may contain // embedded null characters. 
Return: // //: o 0 on success and load the specified 'result' with, respectively, //: a '(offset, length)' pair or a 'bsl::string_view' indicating the //: leftmost match of 'pattern()' //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, otherwise // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', 'subjectOffset <= subjectLength', // and 'subject' is valid UTF-8 if 'pattern()' was prepared with // 'k_FLAG_UTF8'. int matchRaw(bsl::string_view *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; // Match the specified 'subject' against 'pattern()'. Begin matching // at the optionally specified 'subjectOffset' in 'subject'. If // 'subjectOffset' is not specified, matching begins at the start of // 'subject'. Return: // //: o 0 on success and load the specified 'result' with a //: 'bsl::string_view' indicating the leftmost match of 'pattern()' //: //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value, otherwise // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()', // 'subjectOffset <= subject.length()', and 'subject' is valid UTF-8 if // 'pattern()' was prepared with 'k_FLAG_UTF8'. int matchRaw(bsl::vector<bsl::pair<size_t, size_t> > *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; int matchRaw(bsl::vector<bslstl::StringRef> *result, const char *subject, size_t subjectLength, size_t subjectOffset = 0) const; // Match the specified 'subject' having the specified 'subjectLength' // against 'pattern()'. 
Begin matching at the optionally specified // 'subjectOffset' in 'subject'. If 'subjectOffset' is not specified, // matching begins at the start of 'subject'. 'subject' may contain // embedded null characters. On success: // //: 1 Load the first element of the specified 'result' with, //: respectively, a '(offset, length)' pair or a 'bslstl::StringRef' //: indicating the leftmost match of 'pattern()'. //: //: 2 Load elements of 'result' in the range '[1 .. numSubpatterns()]' //: with, respectively, a '(offset, length)' pair or a //: 'bslstl::StringRef' indicating the respective matches of //: sub-patterns (unmatched sub-patterns have their respective //: 'result' elements loaded with either the '(k_INVALID_OFFSET, 0)' //: pair or an empty 'bslstl::StringRef'); sub-patterns matching //: multiple times have their respective 'result' elements loaded //: with the pairs or 'bslstl::StringRef' indicating the rightmost //: match, and return 0. // // Otherwise, return: // //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()', // 'subject || 0 == subjectLength', 'subjectOffset <= subjectLength', // and 'subject' is valid UTF-8 if 'pattern()' was prepared with // 'k_FLAG_UTF8'. Note that after a successful call, 'result' will // contain exactly 'numSubpatterns() + 1' elements. 
int matchRaw(bsl::vector<bsl::string_view> *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; int matchRaw(std::vector<bsl::string_view> *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR int matchRaw(std::pmr::vector<bsl::string_view> *result, const bsl::string_view& subject, size_t subjectOffset = 0) const; #endif // Match the specified 'subject' against 'pattern()'. Begin matching // at the optionally specified 'subjectOffset' in 'subject'. If // 'subjectOffset' is not specified, matching begins at the start of // 'subject'. On success: // //: 1 Load the first element of the specified 'result' with a //: 'bsl::string_view' indicating the leftmost match of 'pattern()'. //: //: 2 Load elements of 'result' in the range '[1 .. numSubpatterns()]' //: with a 'bsl::string_view' indicating the respective matches of //: sub-patterns (unmatched sub-patterns have their respective //: 'result' elements loaded with an empty 'bsl::string_view'); //: sub-patterns matching multiple times have their respective //: 'result' elements loaded with a 'bsl::string_view' indicating the //: rightmost match, and return 0. // // Otherwise, return: // //: o 1 if 'depthLimit()' was exceeded //: //: o 2 if memory available for the JIT stack is not large enough //: (applicable only if 'pattern()' was prepared with 'k_FLAG_JIT') //: //: o another non-zero value // // 'result' is unchanged if a non-zero value is returned. The behavior // is undefined unless 'true == isPrepared()', // 'subjectOffset <= subject.length()', and 'subject' is valid UTF-8 if // 'pattern()' was prepared with 'k_FLAG_UTF8'. Also note that after a // successful call, 'result' will contain exactly // 'numSubpatterns() + 1' elements. int numSubpatterns() const; // Return the number of sub-patterns in the pattern held by this // regular-expression object ('pattern()'). The behavior is undefined // unless 'isPrepared() == true'. 
const bsl::string& pattern() const; // Return a reference to the non-modifiable pattern held by this // regular-expression object. The behavior is undefined unless // 'isPrepared() == true'. int replace(bsl::string *result, int *errorOffset, const bsl::string_view& subject, const bsl::string_view& replacement, size_t options = 0) const; int replace(std::string *result, int *errorOffset, const bsl::string_view& subject, const bsl::string_view& replacement, size_t options = 0) const; #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR int replace(std::pmr::string *result, int *errorOffset, const bsl::string_view& subject, const bsl::string_view& replacement, size_t options = 0) const; #endif // Replace parts of the specified 'subject' that are matched with the // specified 'replacement'. Optionally specify a bit mask of 'options' // flags that configure the behavior of the replacement. 'options' // should contain a bit-wise OR of the 'k_REPLACE_*' constants defined // by this class, which indicate additional configuration parameters // for the replacement. If 'options' has 'k_REPLACE_GLOBAL' flag then // this function iterates over 'subject', replacing every matching // substring. If 'k_REPLACE_GLOBAL' flag is not set, only the first // matching substring is replaced. UTF-8 validity checking is // performed on 'subject' and 'replacement' if 'pattern()' was prepared // with 'k_FLAG_UTF8'. Return the number of substitutions that were // carried out on success, and load the specified 'result' with the // result of the replacement. Otherwise, if an error occurs, return a // negative value. If that error is a syntax error in 'replacement', // load the specified 'errorOffset' (if non-null) with the offset in // 'replacement' where the error was detected; for other errors, such // as invalid 'subject' or 'replacement' UTF-8 string, load // 'errorOffset' with a negative value. The behavior is undefined // unless 'true == isPrepared()'. 
Note that if the size of 'result' is // too small to fit the resultant string then this method computes the // size of 'result' and adjusts it to the size that is needed. To // avoid automatic calculation and adjustment which may introduce a // performance penalty, it is recommended that the size of 'result' has // enough room to fit the resulting string including a zero-terminating // character. int replaceRaw(bsl::string *result, int *errorOffset, const bsl::string_view& subject, const bsl::string_view& replacement, size_t options = 0) const; int replaceRaw(std::string *result, int *errorOffset, const bsl::string_view& subject, const bsl::string_view& replacement, size_t options = 0) const; #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR int replaceRaw(std::pmr::string *result, int *errorOffset, const bsl::string_view& subject, const bsl::string_view& replacement, size_t options = 0) const; #endif // Replace parts of the specified 'subject' that are matched with the // specified 'replacement'. Optionally specify a bit mask of 'options' // flags that configure the behavior of the replacement. 'options' // should contain a bit-wise OR of the 'k_REPLACE_*' constants defined // by this class, which indicate additional configuration parameters // for the replacement. If 'options' has 'k_REPLACE_GLOBAL' flag then // this function iterates over 'subject', replacing every matching // substring. If 'k_REPLACE_GLOBAL' flag is not set, only the first // matching substring is replaced. UTF-8 validity checking is // performed on 'subject' if 'pattern()' was prepared with // 'k_FLAG_UTF8'. Return the number of substitutions that were carried // out on success, and load the specified 'result' with the result of // the replacement. Otherwise, if an error occurs, return a negative // value. 
If that error is a syntax error in 'replacement', load the // specified 'errorOffset' (if non-null) with the offset in // 'replacement' where the error was detected; for other errors, such // as invalid 'subject' UTF-8 string, load 'errorOffset' with a // negative value. The behavior is undefined unless // 'true == isPrepared()'. Note that if the size of 'result' is too // small to fit the resultant string then this method computes the size // of 'result' and adjusts it to the size that is needed. To avoid // automatic calculation and adjustment which may introduce a // performance penalty, it is recommended that the size of 'result' has // enough room to fit the resulting string including a zero-terminating // character. int subpatternIndex(const char *name) const; // Return the 1-based index of the sub-pattern having the specified // 'name' in the pattern held by this regular-expression object // ('pattern()'); return -1 if 'pattern()' does not contain a // sub-pattern identified by 'name'. The behavior is undefined unless // 'isPrepared() == true'. Note that the returned value is intended to // be used as an index into the 'bsl::vector<bsl::pair<size_t, size_t> >' // loaded by 'match' and 'matchRaw'.
}; // ============================================================================ // INLINE DEFINITIONS // ============================================================================ // ----------- // class RegEx // ----------- // CLASS METHODS inline int RegEx::defaultDepthLimit() { return bsls::AtomicOperations::getIntRelaxed(&s_depthLimit); } inline int RegEx::setDefaultDepthLimit(int depthLimit) { int previous = defaultDepthLimit(); bsls::AtomicOperations::setIntRelaxed(&s_depthLimit, depthLimit); return previous; } // CREATORS inline RegEx::~RegEx() { clear(); pcre2_compile_context_free(d_compileContext_p); pcre2_general_context_free(d_pcre2Context_p); } // MANIPULATORS template <class STRING> typename bsl::enable_if< bsl::is_same<STRING, bsl::string>::value || bsl::is_same<STRING, std::string>::value #ifdef BSLS_LIBRARYFEATURES_HAS_CPP17_PMR || bsl::is_same<STRING, std::pmr::string>::value #endif , int>::type RegEx::prepare(STRING *errorMessage, size_t *errorOffset, const char *pattern, int flags, size_t jitStackSize) { const int k_BUFFER_LEN = 256; char buffer[k_BUFFER_LEN] = {0}; size_t offset; int ret = prepareImp(&buffer[0], k_BUFFER_LEN - 1, &offset, pattern, flags, jitStackSize); if (ret) { if (errorMessage) { errorMessage->assign(&buffer[0]); } if (errorOffset) { *errorOffset = offset; } } return ret; } // ACCESSORS inline int RegEx::depthLimit() const { return d_depthLimit; } inline int RegEx::flags() const { return d_flags; } inline bool RegEx::isPrepared() const { return (0 != d_patternCode_p); } inline size_t RegEx::jitStackSize() const { return d_jitStackSize; } inline const bsl::string& RegEx::pattern() const { return d_pattern; } } // close package namespace } // close enterprise namespace #endif // ---------------------------------------------------------------------------- // Copyright 2016 Bloomberg Finance L.P. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ----------------------------- END-OF-FILE ----------------------------------