From a9aa206b7b8ac4e69f8c46233b4080e00e845ff5 Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Mon, 27 May 2019 19:13:54 +0200 Subject: Move text-related code out of corelib/tools/ to corelib/text/ This includes byte array, string, char, unicode, locale, collation and regular expressions. Change-Id: I8b125fa52c8c513eb57a0f1298b91910e5a0d786 Reviewed-by: Volker Hilsheimer --- src/corelib/text/qregexp.cpp | 4609 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4609 insertions(+) create mode 100644 src/corelib/text/qregexp.cpp (limited to 'src/corelib/text/qregexp.cpp') diff --git a/src/corelib/text/qregexp.cpp b/src/corelib/text/qregexp.cpp new file mode 100644 index 0000000000..d81826b47b --- /dev/null +++ b/src/corelib/text/qregexp.cpp @@ -0,0 +1,4609 @@ +/**************************************************************************** +** +** Copyright (C) 2016 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qregexp.h" + +#include "qalgorithms.h" +#include "qbitarray.h" +#include "qcache.h" +#include "qdatastream.h" +#include "qdebug.h" +#include "qhashfunctions.h" +#include "qlist.h" +#include "qmap.h" +#include "qmutex.h" +#include "qstring.h" +#include "qstringlist.h" +#include "qstringmatcher.h" +#include "qvector.h" + +#include +#include + +QT_BEGIN_NAMESPACE + +// error strings for the regexp parser +#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred") +#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used") +#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax") +#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax") +#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371") +#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax") +#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value") +#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim") +#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end") +#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit") +#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval") +#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category") + +/*! + \class QRegExp + \inmodule QtCore + \reentrant + \brief The QRegExp class provides pattern matching using regular expressions. + + \ingroup tools + \ingroup shared + + \keyword regular expression + + A regular expression, or "regexp", is a pattern for matching + substrings in a text. This is useful in many contexts, e.g., + + \table + \row \li Validation + \li A regexp can test whether a substring meets some criteria, + e.g. is an integer or contains no whitespace. + \row \li Searching + \li A regexp provides more powerful pattern matching than + simple substring matching, e.g., match one of the words + \e{mail}, \e{letter} or \e{correspondence}, but none of the + words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. + \row \li Search and Replace + \li A regexp can replace all occurrences of a substring with a + different substring, e.g., replace all occurrences of \e{&} + with \e{\&} except where the \e{&} is already followed by + an \e{amp;}. + \row \li String Splitting + \li A regexp can be used to identify where a string should be + split apart, e.g. splitting tab-delimited strings. + \endtable + + A brief introduction to regexps is presented, a description of + Qt's regexp language, some examples, and the function + documentation itself. QRegExp is modeled on Perl's regexp + language. It fully supports Unicode. QRegExp can also be used in a + simpler, \e{wildcard mode} that is similar to the functionality + found in command shells. The syntax rules used by QRegExp can be + changed with setPatternSyntax(). In particular, the pattern syntax + can be set to QRegExp::FixedString, which means the pattern to be + matched is interpreted as a plain string, i.e., special characters + (e.g., backslash) are not escaped. + + A good text on regexps is \e {Mastering Regular Expressions} + (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4. + + \note In Qt 5, the new QRegularExpression class provides a Perl + compatible implementation of regular expressions and is recommended + in place of QRegExp. + + \tableofcontents + + \section1 Introduction + + Regexps are built up from expressions, quantifiers, and + assertions. The simplest expression is a character, e.g. \b{x} + or \b{5}. An expression can also be a set of characters + enclosed in square brackets. \b{[ABCD]} will match an \b{A} + or a \b{B} or a \b{C} or a \b{D}. We can write this same + expression as \b{[A-D]}, and an expression to match any + capital letter in the English alphabet is written as + \b{[A-Z]}. + + A quantifier specifies the number of occurrences of an expression + that must be matched. \b{x{1,1}} means match one and only one + \b{x}. \b{x{1,5}} means match a sequence of \b{x} + characters that contains at least one \b{x} but no more than + five. + + Note that in general regexps cannot be used to check for balanced + brackets or tags. For example, a regexp can be written to match an + opening html \c{} and its closing \c{}, if the \c{} tags + are not nested, but if the \c{} tags are nested, that same + regexp will match an opening \c{} tag with the wrong closing + \c{}. For the fragment \c{bold bolder}, the + first \c{} would be matched with the first \c{}, which is + not correct. However, it is possible to write a regexp that will + match nested brackets or tags correctly, but only if the number of + nesting levels is fixed and known. If the number of nesting levels + is not fixed and known, it is impossible to write a regexp that + will not fail. + + Suppose we want a regexp to match integers in the range 0 to 99. + At least one digit is required, so we start with the expression + \b{[0-9]{1,1}}, which matches a single digit exactly once. This + regexp matches integers in the range 0 to 9. To match integers up + to 99, increase the maximum number of occurrences to 2, so the + regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the + original requirement to match integers from 0 to 99, but it will + also match integers that occur in the middle of strings. If we + want the matched integer to be the whole string, we must use the + anchor assertions, \b{^} (caret) and \b{$} (dollar). When + \b{^} is the first character in a regexp, it means the regexp + must match from the beginning of the string. When \b{$} is the + last character of the regexp, it means the regexp must match to + the end of the string. The regexp becomes \b{^[0-9]{1,2}$}. + Note that assertions, e.g. \b{^} and \b{$}, do not match + characters but locations in the string. + + If you have seen regexps described elsewhere, they may have looked + different from the ones shown here. This is because some sets of + characters and some quantifiers are so common that they have been + given special symbols to represent them. \b{[0-9]} can be + replaced with the symbol \b{\\d}. The quantifier to match + exactly one occurrence, \b{{1,1}}, can be replaced with the + expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So + our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can + also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of + the string, match a digit, followed immediately by 0 or 1 digits}. + In practice, it would be written as \b{^\\d\\d?$}. The \b{?} + is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1 + occurrences. \b{?} makes an expression optional. The regexp + \b{^\\d\\d?$} means \e{From the beginning of the string, match + one digit, followed immediately by 0 or 1 more digit, followed + immediately by end of string}. + + To write a regexp that matches one of the words 'mail' \e or + 'letter' \e or 'correspondence' but does not match words that + contain these words, e.g., 'email', 'mailman', 'mailer', and + 'letterbox', start with a regexp that matches 'mail'. Expressed + fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because + a character expression is automatically quantified by + \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an + 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now + we can use the vertical bar \b{|}, which means \b{or}, to + include the other two words, so our regexp for matching any of the + three words becomes \b{mail|letter|correspondence}. Match + 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this + regexp will match one of the three words we want to match, it will + also match words we don't want to match, e.g., 'email'. To + prevent the regexp from matching unwanted words, we must tell it + to begin and end the match at word boundaries. First we enclose + our regexp in parentheses, \b{(mail|letter|correspondence)}. + Parentheses group expressions together, and they identify a part + of the regexp that we wish to \l{capturing text}{capture}. + Enclosing the expression in parentheses allows us to use it as a + component in more complex regexps. It also allows us to examine + which of the three words was actually matched. To force the match + to begin and end on word boundaries, we enclose the regexp in + \b{\\b} \e{word boundary} assertions: + \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means: + \e{Match a word boundary, followed by the regexp in parentheses, + followed by a word boundary}. The \b{\\b} assertion matches a + \e position in the regexp, not a \e character. A word boundary is + any non-word character, e.g., a space, newline, or the beginning + or ending of a string. + + If we want to replace ampersand characters with the HTML entity + \b{\&}, the regexp to match is simply \b{\&}. But this + regexp will also match ampersands that have already been converted + to HTML entities. We want to replace only ampersands that are not + already followed by \b{amp;}. For this, we need the negative + lookahead assertion, \b{(?!}__\b{)}. The regexp can then be + written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is} + \b{not} \e{followed by} \b{amp;}. + + If we want to count all the occurrences of 'Eric' and 'Eirik' in a + string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and + \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is + required to avoid matching words that contain either name, + e.g. 'Ericsson'. Note that the second regexp matches more + spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'. + + Some of the examples discussed above are implemented in the + \l{#code-examples}{code examples} section. + + \target characters-and-abbreviations-for-sets-of-characters + \section1 Characters and Abbreviations for Sets of Characters + + \table + \header \li Element \li Meaning + \row \li \b{c} + \li A character represents itself unless it has a special + regexp meaning. e.g. \b{c} matches the character \e c. + \row \li \b{\\c} + \li A character that follows a backslash matches the character + itself, except as specified below. e.g., To match a literal + caret at the beginning of a string, write \b{\\^}. + \row \li \b{\\a} + \li Matches the ASCII bell (BEL, 0x07). + \row \li \b{\\f} + \li Matches the ASCII form feed (FF, 0x0C). + \row \li \b{\\n} + \li Matches the ASCII line feed (LF, 0x0A, Unix newline). + \row \li \b{\\r} + \li Matches the ASCII carriage return (CR, 0x0D). + \row \li \b{\\t} + \li Matches the ASCII horizontal tab (HT, 0x09). + \row \li \b{\\v} + \li Matches the ASCII vertical tab (VT, 0x0B). + \row \li \b{\\x\e{hhhh}} + \li Matches the Unicode character corresponding to the + hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF). + \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo}) + \li matches the ASCII/Latin1 character for the octal number + \e{ooo} (between 0 and 0377). + \row \li \b{. (dot)} + \li Matches any character (including newline). + \row \li \b{\\d} + \li Matches a digit (QChar::isDigit()). + \row \li \b{\\D} + \li Matches a non-digit. + \row \li \b{\\s} + \li Matches a whitespace character (QChar::isSpace()). + \row \li \b{\\S} + \li Matches a non-whitespace character. + \row \li \b{\\w} + \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_'). + \row \li \b{\\W} + \li Matches a non-word character. + \row \li \b{\\\e{n}} + \li The \e{n}-th backreference, e.g. \\1, \\2, etc. + \endtable + + \b{Note:} The C++ compiler transforms backslashes in strings. + To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}. + To match the backslash character itself, enter it four times, i.e. + \c{\\\\}. + + \target sets-of-characters + \section1 Sets of Characters + + Square brackets mean match any character contained in the square + brackets. The character set abbreviations described above can + appear in a character set in square brackets. Except for the + character set abbreviations and the following two exceptions, + characters do not have special meanings in square brackets. + + \table + \row \li \b{^} + + \li The caret negates the character set if it occurs as the + first character (i.e. immediately after the opening square + bracket). \b{[abc]} matches 'a' or 'b' or 'c', but + \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'. + + \row \li \b{-} + + \li The dash indicates a range of characters. \b{[W-Z]} + matches 'W' or 'X' or 'Y' or 'Z'. + + \endtable + + Using the predefined character set abbreviations is more portable + than using character ranges across platforms and languages. For + example, \b{[0-9]} matches a digit in Western alphabets but + \b{\\d} matches a digit in \e any alphabet. + + Note: In other regexp documentation, sets of characters are often + called "character classes". + + \target quantifiers + \section1 Quantifiers + + By default, an expression is automatically quantified by + \b{{1,1}}, i.e. it should occur exactly once. In the following + list, \b{\e {E}} stands for expression. An expression is a + character, or an abbreviation for a set of characters, or a set of + characters in square brackets, or an expression in parentheses. + + \table + \row \li \b{\e {E}?} + + \li Matches zero or one occurrences of \e E. This quantifier + means \e{The previous expression is optional}, because it + will match whether or not the expression is found. \b{\e + {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?} + matches 'dent' or 'dents'. + + \row \li \b{\e {E}+} + + \li Matches one or more occurrences of \e E. \b{\e {E}+} is + the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0', + '00', '000', etc. + + \row \li \b{\e {E}*} + + \li Matches zero or more occurrences of \e E. It is the same + as \b{\e {E}{0,}}. The \b{*} quantifier is often used + in error where \b{+} should be used. For example, if + \b{\\s*$} is used in an expression to match strings that + end in whitespace, it will match every string because + \b{\\s*$} means \e{Match zero or more whitespaces followed + by end of string}. The correct regexp to match strings that + have at least one trailing whitespace character is + \b{\\s+$}. + + \row \li \b{\e {E}{n}} + + \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}} + is the same as repeating \e E \e n times. For example, + \b{x{5}} is the same as \b{xxxxx}. It is also the same + as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}. + + \row \li \b{\e {E}{n,}} + \li Matches at least \e n occurrences of \e E. + + \row \li \b{\e {E}{,m}} + \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}} + is the same as \b{\e {E}{0,m}}. + + \row \li \b{\e {E}{n,m}} + \li Matches at least \e n and at most \e m occurrences of \e E. + \endtable + + To apply a quantifier to more than just the preceding character, + use parentheses to group characters together in an expression. For + example, \b{tag+} matches a 't' followed by an 'a' followed by + at least one 'g', whereas \b{(tag)+} matches at least one + occurrence of 'tag'. + + Note: Quantifiers are normally "greedy". They always match as much + text as they can. For example, \b{0+} matches the first zero it + finds and all the consecutive zeros after the first zero. Applied + to '20005', it matches '2\underline{000}5'. Quantifiers can be made + non-greedy, see setMinimal(). + + \target capturing parentheses + \target backreferences + \section1 Capturing Text + + Parentheses allow us to group elements together so that we can + quantify and capture them. For example if we have the expression + \b{mail|letter|correspondence} that matches a string we know + that \e one of the words matched but not which one. Using + parentheses allows us to "capture" whatever is matched within + their bounds, so if we used \b{(mail|letter|correspondence)} + and matched this regexp against the string "I sent you some email" + we can use the cap() or capturedTexts() functions to extract the + matched characters, in this case 'mail'. + + We can use captured text within the regexp itself. To refer to the + captured text we use \e backreferences which are indexed from 1, + the same as for cap(). For example we could search for duplicate + words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a + word boundary followed by one or more word characters followed by + one or more non-word characters followed by the same text as the + first parenthesized expression followed by a word boundary. + + If we want to use parentheses purely for grouping and not for + capturing we can use the non-capturing syntax, e.g. + \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and + end ')'. In this example we match either 'green' or 'blue' but we + do not capture the match so we only know whether or not we matched + but not which color we actually found. Using non-capturing + parentheses is more efficient than using capturing parentheses + since the regexp engine has to do less book-keeping. + + Both capturing and non-capturing parentheses may be nested. + + \target greedy quantifiers + + For historical reasons, quantifiers (e.g. \b{*}) that apply to + capturing parentheses are more "greedy" than other quantifiers. + For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa". + This behavior is different from what other regexp engines do + (notably, Perl). To obtain a more intuitive capturing behavior, + specify QRegExp::RegExp2 to the QRegExp constructor or call + setPatternSyntax(QRegExp::RegExp2). + + \target cap_in_a_loop + + When the number of matches cannot be determined in advance, a + common idiom is to use cap() in a loop. For example: + + \snippet code/src_corelib_tools_qregexp.cpp 0 + + \target assertions + \section1 Assertions + + Assertions make some statement about the text at the point where + they occur in the regexp but they do not match any characters. In + the following list \b{\e {E}} stands for any expression. + + \table + \row \li \b{^} + \li The caret signifies the beginning of the string. If you + wish to match a literal \c{^} you must escape it by + writing \c{\\^}. For example, \b{^#include} will only + match strings which \e begin with the characters '#include'. + (When the caret is the first character of a character set it + has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.) + + \row \li \b{$} + \li The dollar signifies the end of the string. For example + \b{\\d\\s*$} will match strings which end with a digit + optionally followed by whitespace. If you wish to match a + literal \c{$} you must escape it by writing + \c{\\$}. + + \row \li \b{\\b} + \li A word boundary. For example the regexp + \b{\\bOK\\b} means match immediately after a word + boundary (e.g. start of string or whitespace) the letter 'O' + then the letter 'K' immediately before another word boundary + (e.g. end of string or whitespace). But note that the + assertion does not actually match any whitespace so if we + write \b{(\\bOK\\b)} and we have a match it will only + contain 'OK' even if the string is "It's \underline{OK} now". + + \row \li \b{\\B} + \li A non-word boundary. This assertion is true wherever + \b{\\b} is false. For example if we searched for + \b{\\Bon\\B} in "Left on" the match would fail (space + and end of string aren't non-word boundaries), but it would + match in "t\underline{on}ne". + + \row \li \b{(?=\e E)} + \li Positive lookahead. This assertion is true if the + expression matches at this point in the regexp. For example, + \b{const(?=\\s+char)} matches 'const' whenever it is + followed by 'char', as in 'static \underline{const} char *'. + (Compare with \b{const\\s+char}, which matches 'static + \underline{const char} *'.) + + \row \li \b{(?!\e E)} + \li Negative lookahead. This assertion is true if the + expression does not match at this point in the regexp. For + example, \b{const(?!\\s+char)} matches 'const' \e except + when it is followed by 'char'. + \endtable + + \target QRegExp wildcard matching + \section1 Wildcard Matching + + Most command shells such as \e bash or \e cmd.exe support "file + globbing", the ability to identify a group of files by using + wildcards. The setPatternSyntax() function is used to switch + between regexp and wildcard mode. Wildcard matching is much + simpler than full regexps and has only four features: + + \table + \row \li \b{c} + \li Any character represents itself apart from those mentioned + below. Thus \b{c} matches the character \e c. + \row \li \b{?} + \li Matches any single character. It is the same as + \b{.} in full regexps. + \row \li \b{*} + \li Matches zero or more of any characters. It is the + same as \b{.*} in full regexps. + \row \li \b{[...]} + \li Sets of characters can be represented in square brackets, + similar to full regexps. Within the character class, like + outside, backslash has no special meaning. + \endtable + + In the mode Wildcard, the wildcard characters cannot be + escaped. In the mode WildcardUnix, the character '\\' escapes the + wildcard. + + For example if we are in wildcard mode and have strings which + contain filenames we could identify HTML files with \b{*.html}. + This will match zero or more characters followed by a dot followed + by 'h', 't', 'm' and 'l'. + + To test a string against a wildcard expression, use exactMatch(). + For example: + + \snippet code/src_corelib_tools_qregexp.cpp 1 + + \target perl-users + \section1 Notes for Perl Users + + Most of the character class abbreviations supported by Perl are + supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters} + {characters and abbreviations for sets of characters}. + + In QRegExp, apart from within character classes, \c{^} always + signifies the start of the string, so carets must always be + escaped unless used for that purpose. In Perl the meaning of caret + varies automagically depending on where it occurs so escaping it + is rarely necessary. The same applies to \c{$} which in + QRegExp always signifies the end of the string. + + QRegExp's quantifiers are the same as Perl's greedy quantifiers + (but see the \l{greedy quantifiers}{note above}). Non-greedy + matching cannot be applied to individual quantifiers, but can be + applied to all the quantifiers in the pattern. For example, to + match the Perl regexp \b{ro+?m} requires: + + \snippet code/src_corelib_tools_qregexp.cpp 2 + + The equivalent of Perl's \c{/i} option is + setCaseSensitivity(Qt::CaseInsensitive). + + Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}. + + In QRegExp \b{.} matches any character, therefore all QRegExp + regexps have the equivalent of Perl's \c{/s} option. QRegExp + does not have an equivalent to Perl's \c{/m} option, but this + can be emulated in various ways for example by splitting the input + into lines or by looping with a regexp that searches for newlines. + + Because QRegExp is string oriented, there are no \\A, \\Z, or \\z + assertions. The \\G assertion is not supported but can be emulated + in a loop. + + Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp + equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, + ... correspond to cap(1) or capturedTexts()[1], cap(2) or + capturedTexts()[2], etc. + + To substitute a pattern use QString::replace(). + + Perl's extended \c{/x} syntax is not supported, nor are + directives, e.g. (?i), or regexp comments, e.g. (?#comment). On + the other hand, C++'s rules for literal strings can be used to + achieve the same: + + \snippet code/src_corelib_tools_qregexp.cpp 3 + + Both zero-width positive and zero-width negative lookahead + assertions (?=pattern) and (?!pattern) are supported with the same + syntax as Perl. Perl's lookbehind assertions, "independent" + subexpressions and conditional expressions are not supported. + + Non-capturing parentheses are also supported, with the same + (?:pattern) syntax. + + See QString::split() and QStringList::join() for equivalents + to Perl's split and join functions. + + Note: because C++ transforms \\'s they must be written \e twice in + code, e.g. \b{\\b} must be written \b{\\\\b}. + + \target code-examples + \section1 Code Examples + + \snippet code/src_corelib_tools_qregexp.cpp 4 + + The third string matches '\underline{6}'. This is a simple validation + regexp for integers in the range 0 to 99. + + \snippet code/src_corelib_tools_qregexp.cpp 5 + + The second string matches '\underline{This_is-OK}'. We've used the + character set abbreviation '\\S' (non-whitespace) and the anchors + to match strings which contain no whitespace. + + In the following example we match strings containing 'mail' or + 'letter' or 'correspondence' but only match whole words i.e. not + 'email' + + \snippet code/src_corelib_tools_qregexp.cpp 6 + + The second string matches "Please write the \underline{letter}". The + word 'letter' is also captured (because of the parentheses). We + can see what text we've captured like this: + + \snippet code/src_corelib_tools_qregexp.cpp 7 + + This will capture the text from the first set of capturing + parentheses (counting capturing left parentheses from left to + right). The parentheses are counted from 1 since cap(0) is the + whole matched regexp (equivalent to '&' in most regexp engines). + + \snippet code/src_corelib_tools_qregexp.cpp 8 + + Here we've passed the QRegExp to QString's replace() function to + replace the matched text with new text. + + \snippet code/src_corelib_tools_qregexp.cpp 9 + + We've used the indexIn() function to repeatedly match the regexp in + the string. Note that instead of moving forward by one character + at a time \c pos++ we could have written \c {pos += + rx.matchedLength()} to skip over the already matched string. The + count will equal 3, matching 'One \underline{Eric} another + \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it + doesn't match 'Ericsson' or 'Eiriks' because they are not bounded + by non-word boundaries. + + One common use of regexps is to split lines of delimited data into + their component fields. + + \snippet code/src_corelib_tools_qregexp.cpp 10 + + In this example our input lines have the format company name, web + address and country. Unfortunately the regexp is rather long and + not very versatile -- the code will break if we add any more + fields. A simpler and better solution is to look for the + separator, '\\t' in this case, and take the surrounding text. The + QString::split() function can take a separator string or regexp + as an argument and split a string accordingly. + + \snippet code/src_corelib_tools_qregexp.cpp 11 + + Here field[0] is the company, field[1] the web address and so on. + + To imitate the matching of a shell we can use wildcard mode. + + \snippet code/src_corelib_tools_qregexp.cpp 12 + + Wildcard matching can be convenient because of its simplicity, but + any wildcard regexp can be defined using full regexps, e.g. + \b{.*\\.html$}. Notice that we can't match both \c .html and \c + .htm files with a wildcard unless we use \b{*.htm*} which will + also match 'test.html.bak'. A full regexp gives us the precision + we need, \b{.*\\.html?$}. + + QRegExp can match case insensitively using setCaseSensitivity(), + and can use non-greedy matching, see setMinimal(). By + default QRegExp uses full regexps but this can be changed with + setPatternSyntax(). Searching can be done forward with indexIn() or backward + with lastIndexIn(). Captured text can be accessed using + capturedTexts() which returns a string list of all captured + strings, or using cap() which returns the captured string for the + given index. The pos() function takes a match index and returns + the position in the string where the match was made (or -1 if + there was no match). + + \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel, + {tools/regexp}{Regular Expression Example} +*/ + +#if defined(Q_OS_VXWORKS) && defined(EOS) +# undef EOS +#endif + +const int NumBadChars = 64; +#define BadChar(ch) ((ch).unicode() % NumBadChars) + +const int NoOccurrence = INT_MAX; +const int EmptyCapture = INT_MAX; +const int InftyLen = INT_MAX; +const int InftyRep = 1025; +const int EOS = -1; + +static bool isWord(QChar ch) +{ + return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_'); +} + +/* + Merges two vectors of ints and puts the result into the first + one. +*/ +static void mergeInto(QVector *a, const QVector &b) +{ + int asize = a->size(); + int bsize = b.size(); + if (asize == 0) { + *a = b; +#ifndef QT_NO_REGEXP_OPTIM + } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) { + a->resize(asize + 1); + (*a)[asize] = b.at(0); +#endif + } else if (bsize >= 1) { + int csize = asize + bsize; + QVector c(csize); + int i = 0, j = 0, k = 0; + while (i < asize) { + if (j < bsize) { + if (a->at(i) == b.at(j)) { + ++i; + --csize; + } else if (a->at(i) < b.at(j)) { + c[k++] = a->at(i++); + } else { + c[k++] = b.at(j++); + } + } else { + memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int)); + break; + } + } + c.resize(csize); + if (j < bsize) + memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int)); + *a = c; + } +} + +#ifndef QT_NO_REGEXP_WILDCARD +/* + Translates a wildcard pattern to an equivalent regular expression + pattern (e.g., *.cpp to .*\.cpp). + + If enableEscaping is true, it is possible to escape the wildcard + characters with \ +*/ +static QString wc2rx(const QString &wc_str, const bool enableEscaping) +{ + const int wclen = wc_str.length(); + QString rx; + int i = 0; + bool isEscaping = false; // the previous character is '\' + const QChar *wc = wc_str.unicode(); + + while (i < wclen) { + const QChar c = wc[i++]; + switch (c.unicode()) { + case '\\': + if (enableEscaping) { + if (isEscaping) { + rx += QLatin1String("\\\\"); + } // we insert the \\ later if necessary + if (i == wclen) { // the end + rx += QLatin1String("\\\\"); + } + } else { + rx += QLatin1String("\\\\"); + } + isEscaping = true; + break; + case '*': + if (isEscaping) { + rx += QLatin1String("\\*"); + isEscaping = false; + } else { + rx += QLatin1String(".*"); + } + break; + case '?': + if (isEscaping) { + rx += QLatin1String("\\?"); + isEscaping = false; + } else { + rx += QLatin1Char('.'); + } + + break; + case '$': + case '(': + case ')': + case '+': + case '.': + case '^': + case '{': + case '|': + case '}': + if (isEscaping) { + isEscaping = false; + rx += QLatin1String("\\\\"); + } + rx += QLatin1Char('\\'); + rx += c; + break; + case '[': + if (isEscaping) { + isEscaping = false; + rx += QLatin1String("\\["); + } else { + rx += c; + if (wc[i] == QLatin1Char('^')) + rx += wc[i++]; + if (i < wclen) { + if (wc[i] == QLatin1Char(']')) + rx += wc[i++]; + while (i < wclen && wc[i] != QLatin1Char(']')) { + if (wc[i] == QLatin1Char('\\')) + rx += QLatin1Char('\\'); + rx += wc[i++]; + } + } + } + break; + + case ']': + if(isEscaping){ + isEscaping = false; + rx += QLatin1String("\\"); + } + rx += c; + break; + + default: + if(isEscaping){ + isEscaping = false; + rx += QLatin1String("\\\\"); + } + rx += c; + } + } + return rx; +} +#endif + +static int caretIndex(int offset, QRegExp::CaretMode caretMode) +{ + if (caretMode == QRegExp::CaretAtZero) { + return 0; + } else if (caretMode == QRegExp::CaretAtOffset) { + return offset; + } else { // QRegExp::CaretWontMatch + return -1; + } +} + +/* + The QRegExpEngineKey struct uniquely identifies an engine. +*/ +struct QRegExpEngineKey +{ + QString pattern; + QRegExp::PatternSyntax patternSyntax; + Qt::CaseSensitivity cs; + + inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax, + Qt::CaseSensitivity cs) + : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {} + + inline void clear() { + pattern.clear(); + patternSyntax = QRegExp::RegExp; + cs = Qt::CaseSensitive; + } +}; + +static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2) +{ + return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax + && key1.cs == key2.cs; +} + +static uint qHash(const QRegExpEngineKey &key, uint seed = 0) noexcept +{ + QtPrivate::QHashCombine hash; + seed = hash(seed, key.pattern); + seed = hash(seed, key.patternSyntax); + seed = hash(seed, key.cs); + return seed; +} + +class QRegExpEngine; + +//Q_DECLARE_TYPEINFO(QVector, Q_MOVABLE_TYPE); + +/* + This is the engine state during matching. +*/ +struct QRegExpMatchState +{ + const QChar *in; // a pointer to the input string data + int pos; // the current position in the string + int caretPos; + int len; // the length of the input string + bool minimal; // minimal matching? + int *bigArray; // big array holding the data for the next pointers + int *inNextStack; // is state is nextStack? + int *curStack; // stack of current states + int *nextStack; // stack of next states + int *curCapBegin; // start of current states' captures + int *nextCapBegin; // start of next states' captures + int *curCapEnd; // end of current states' captures + int *nextCapEnd; // end of next states' captures + int *tempCapBegin; // start of temporary captures + int *tempCapEnd; // end of temporary captures + int *capBegin; // start of captures for a next state + int *capEnd; // end of captures for a next state + int *slideTab; // bump-along slide table for bad-character heuristic + int *captured; // what match() returned last + int slideTabSize; // size of slide table + int capturedSize; +#ifndef QT_NO_REGEXP_BACKREF + QList > sleeping; // list of back-reference sleepers +#endif + int matchLen; // length of match + int oneTestMatchedLen; // length of partial match + + const QRegExpEngine *eng; + + inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {} + inline ~QRegExpMatchState() { free(bigArray); } + + void drain() { free(bigArray); bigArray = nullptr; captured = nullptr; } // to save memory + void prepareForMatch(QRegExpEngine *eng); + void match(const QChar *str, int len, int pos, bool minimal, + bool oneTest, int caretIndex); + bool matchHere(); + bool testAnchor(int i, int a, const int *capBegin); +}; + +/* + The struct QRegExpAutomatonState represents one state in a modified NFA. The + input characters matched are stored in the state instead of on + the transitions, something possible for an automaton + constructed from a regular expression. +*/ +struct QRegExpAutomatonState +{ +#ifndef QT_NO_REGEXP_CAPTURE + int atom; // which atom does this state belong to? +#endif + int match; // what does it match? (see CharClassBit and BackRefBit) + QVector outs; // out-transitions + QMap reenter; // atoms reentered when transiting out + QMap anchors; // anchors met when transiting out + + inline QRegExpAutomatonState() { } +#ifndef QT_NO_REGEXP_CAPTURE + inline QRegExpAutomatonState(int a, int m) + : atom(a), match(m) { } +#else + inline QRegExpAutomatonState(int m) + : match(m) { } +#endif +}; + +Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE); + +/* + The struct QRegExpCharClassRange represents a range of characters (e.g., + [0-9] denotes range 48 to 57). +*/ +struct QRegExpCharClassRange +{ + ushort from; // 48 + ushort len; // 10 +}; + +Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE); + +#ifndef QT_NO_REGEXP_CAPTURE +/* + The struct QRegExpAtom represents one node in the hierarchy of regular + expression atoms. +*/ +struct QRegExpAtom +{ + enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 }; + + int parent; // index of parent in array of atoms + int capture; // index of capture, from 1 to ncap - 1 +}; + +Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE); +#endif + +struct QRegExpLookahead; + +#ifndef QT_NO_REGEXP_ANCHOR_ALT +/* + The struct QRegExpAnchorAlternation represents a pair of anchors with + OR semantics. +*/ +struct QRegExpAnchorAlternation +{ + int a; // this anchor... + int b; // ...or this one +}; + +Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE); +#endif + +#ifndef QT_NO_REGEXP_CCLASS + +#define FLAG(x) (1 << (x)) +/* + The class QRegExpCharClass represents a set of characters, such as can + be found in regular expressions (e.g., [a-z] denotes the set + {a, b, ..., z}). +*/ +class QRegExpCharClass +{ +public: + QRegExpCharClass(); + + void clear(); + bool negative() const { return n; } + void setNegative(bool negative); + void addCategories(uint cats); + void addRange(ushort from, ushort to); + void addSingleton(ushort ch) { addRange(ch, ch); } + + bool in(QChar ch) const; +#ifndef QT_NO_REGEXP_OPTIM + const QVector &firstOccurrence() const { return occ1; } +#endif + +#if defined(QT_DEBUG) + void dump() const; +#endif + +private: + QVector r; // character ranges +#ifndef QT_NO_REGEXP_OPTIM + QVector occ1; // first-occurrence array +#endif + uint c; // character classes + bool n; // negative? +}; +#else +struct QRegExpCharClass +{ + int dummy; + +#ifndef QT_NO_REGEXP_OPTIM + QRegExpCharClass() { occ1.fill(0, NumBadChars); } + + const QVector &firstOccurrence() const { return occ1; } + QVector occ1; +#endif +}; +#endif + +Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE); + +/* + The QRegExpEngine class encapsulates a modified nondeterministic + finite automaton (NFA). +*/ +class QRegExpEngine +{ +public: + QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers) + : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); } + + QRegExpEngine(const QRegExpEngineKey &key); + ~QRegExpEngine(); + + bool isValid() const { return valid; } + const QString &errorString() const { return yyError; } + int captureCount() const { return officialncap; } + + int createState(QChar ch); + int createState(const QRegExpCharClass &cc); +#ifndef QT_NO_REGEXP_BACKREF + int createState(int bref); +#endif + + void addCatTransitions(const QVector &from, const QVector &to); +#ifndef QT_NO_REGEXP_CAPTURE + void addPlusTransitions(const QVector &from, const QVector &to, int atom); +#endif + +#ifndef QT_NO_REGEXP_ANCHOR_ALT + int anchorAlternation(int a, int b); + int anchorConcatenation(int a, int b); +#else + int anchorAlternation(int a, int b) { return a & b; } + int anchorConcatenation(int a, int b) { return a | b; } +#endif + void addAnchors(int from, int to, int a); + +#ifndef QT_NO_REGEXP_OPTIM + void heuristicallyChooseHeuristic(); +#endif + +#if defined(QT_DEBUG) + void dump() const; +#endif + + QAtomicInt ref; + +private: + enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; + enum { InitialState = 0, FinalState = 1 }; + + void setup(); + int setupState(int match); + + /* + Let's hope that 13 lookaheads and 14 back-references are + enough. + */ + enum { MaxLookaheads = 13, MaxBackRefs = 14 }; + enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, + Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, + Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, + Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, + Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs, + + Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^ + ((Anchor_FirstLookahead << MaxLookaheads) - 1) }; +#ifndef QT_NO_REGEXP_CAPTURE + int startAtom(bool officialCapture); + void finishAtom(int atom, bool needCapture); +#endif + +#ifndef QT_NO_REGEXP_LOOKAHEAD + int addLookahead(QRegExpEngine *eng, bool negative); +#endif + +#ifndef QT_NO_REGEXP_OPTIM + bool goodStringMatch(QRegExpMatchState &matchState) const; + bool badCharMatch(QRegExpMatchState &matchState) const; +#else + bool bruteMatch(QRegExpMatchState &matchState) const; +#endif + + QVector s; // array of states +#ifndef QT_NO_REGEXP_CAPTURE + QVector f; // atom hierarchy + int nf; // number of atoms + int cf; // current atom + QVector captureForOfficialCapture; +#endif + int officialncap; // number of captures, seen from the outside + int ncap; // number of captures, seen from the inside +#ifndef QT_NO_REGEXP_CCLASS + QVector cl; // array of character classes +#endif +#ifndef QT_NO_REGEXP_LOOKAHEAD + QVector ahead; // array of lookaheads +#endif +#ifndef QT_NO_REGEXP_ANCHOR_ALT + QVector aa; // array of (a, b) pairs of anchors +#endif +#ifndef QT_NO_REGEXP_OPTIM + bool caretAnchored; // does the regexp start with ^? + bool trivial; // is the good-string all that needs to match? +#endif + bool valid; // is the regular expression valid? + Qt::CaseSensitivity cs; // case sensitive? + bool greedyQuantifiers; // RegExp2? + bool xmlSchemaExtensions; +#ifndef QT_NO_REGEXP_BACKREF + int nbrefs; // number of back-references +#endif + +#ifndef QT_NO_REGEXP_OPTIM + bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch + + int goodEarlyStart; // the index where goodStr can first occur in a match + int goodLateStart; // the index where goodStr can last occur in a match + QString goodStr; // the string that any match has to contain + + int minl; // the minimum length of a match + QVector occ1; // first-occurrence array +#endif + + /* + The class Box is an abstraction for a regular expression + fragment. It can also be seen as one node in the syntax tree of + a regular expression with synthetized attributes. + + Its interface is ugly for performance reasons. + */ + class Box + { + public: + Box(QRegExpEngine *engine); + Box(const Box &b) { operator=(b); } + + Box &operator=(const Box &b); + + void clear() { operator=(Box(eng)); } + void set(QChar ch); + void set(const QRegExpCharClass &cc); +#ifndef QT_NO_REGEXP_BACKREF + void set(int bref); +#endif + + void cat(const Box &b); + void orx(const Box &b); + void plus(int atom); + void opt(); + void catAnchor(int a); +#ifndef QT_NO_REGEXP_OPTIM + void setupHeuristics(); +#endif + +#if defined(QT_DEBUG) + void dump() const; +#endif + + private: + void addAnchorsToEngine(const Box &to) const; + + QRegExpEngine *eng; // the automaton under construction + QVector ls; // the left states (firstpos) + QVector rs; // the right states (lastpos) + QMap lanchors; // the left anchors + QMap ranchors; // the right anchors + int skipanchors; // the anchors to match if the box is skipped + +#ifndef QT_NO_REGEXP_OPTIM + int earlyStart; // the index where str can first occur + int lateStart; // the index where str can last occur + QString str; // a string that has to occur in any match + QString leftStr; // a string occurring at the left of this box + QString rightStr; // a string occurring at the right of this box + int maxl; // the maximum length of this box (possibly InftyLen) +#endif + + int minl; // the minimum length of this box +#ifndef QT_NO_REGEXP_OPTIM + QVector occ1; // first-occurrence array +#endif + }; + + friend class Box; + + /* + This is the lexical analyzer for regular expressions. + */ + enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, + Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, + Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; + int getChar(); + int getEscape(); +#ifndef QT_NO_REGEXP_INTERVAL + int getRep(int def); +#endif +#ifndef QT_NO_REGEXP_LOOKAHEAD + void skipChars(int n); +#endif + void error(const char *msg); + void startTokenizer(const QChar *rx, int len); + int getToken(); + + const QChar *yyIn; // a pointer to the input regular expression pattern + int yyPos0; // the position of yyTok in the input pattern + int yyPos; // the position of the next character to read + int yyLen; // the length of yyIn + int yyCh; // the last character read + QScopedPointer yyCharClass; // attribute for Tok_CharClass tokens + int yyMinRep; // attribute for Tok_Quantifier + int yyMaxRep; // ditto + QString yyError; // syntax error or overflow during parsing? + + /* + This is the syntactic analyzer for regular expressions. + */ + int parse(const QChar *rx, int len); + void parseAtom(Box *box); + void parseFactor(Box *box); + void parseTerm(Box *box); + void parseExpression(Box *box); + + int yyTok; // the last token read + bool yyMayCapture; // set this to false to disable capturing + + friend struct QRegExpMatchState; +}; + +#ifndef QT_NO_REGEXP_LOOKAHEAD +/* + The struct QRegExpLookahead represents a lookahead a la Perl (e.g., + (?=foo) and (?!bar)). +*/ +struct QRegExpLookahead +{ + QRegExpEngine *eng; // NFA representing the embedded regular expression + bool neg; // negative lookahead? + + inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0) + : eng(eng0), neg(neg0) { } + inline ~QRegExpLookahead() { delete eng; } +}; +#endif + +/*! + \internal + convert the pattern string to the RegExp syntax. + + This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan + */ +Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax) +{ + switch (patternSyntax) { +#ifndef QT_NO_REGEXP_WILDCARD + case QRegExp::Wildcard: + return wc2rx(pattern, false); + case QRegExp::WildcardUnix: + return wc2rx(pattern, true); +#endif + case QRegExp::FixedString: + return QRegExp::escape(pattern); + case QRegExp::W3CXmlSchema11: + default: + return pattern; + } +} + +QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) + : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2), + xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11) +{ + setup(); + + QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax); + + valid = (parse(rx.unicode(), rx.length()) == rx.length()); + if (!valid) { +#ifndef QT_NO_REGEXP_OPTIM + trivial = false; +#endif + error(RXERR_LEFTDELIM); + } +} + +QRegExpEngine::~QRegExpEngine() +{ +#ifndef QT_NO_REGEXP_LOOKAHEAD + qDeleteAll(ahead); +#endif +} + +void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng) +{ + /* + We use one QVector for all the big data used a lot in + matchHere() and friends. + */ + int ns = eng->s.size(); // number of states + int ncap = eng->ncap; +#ifndef QT_NO_REGEXP_OPTIM + int newSlideTabSize = qMax(eng->minl + 1, 16); +#else + int newSlideTabSize = 0; +#endif + int numCaptures = eng->captureCount(); + int newCapturedSize = 2 + 2 * numCaptures; + bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int))); + + // set all internal variables only _after_ bigArray is realloc'ed + // to prevent a broken regexp in oom case + + slideTabSize = newSlideTabSize; + capturedSize = newCapturedSize; + inNextStack = bigArray; + memset(inNextStack, -1, ns * sizeof(int)); + curStack = inNextStack + ns; + nextStack = inNextStack + 2 * ns; + + curCapBegin = inNextStack + 3 * ns; + nextCapBegin = curCapBegin + ncap * ns; + curCapEnd = curCapBegin + 2 * ncap * ns; + nextCapEnd = curCapBegin + 3 * ncap * ns; + + tempCapBegin = curCapBegin + 4 * ncap * ns; + tempCapEnd = tempCapBegin + ncap; + capBegin = tempCapBegin + 2 * ncap; + capEnd = tempCapBegin + 3 * ncap; + + slideTab = tempCapBegin + 4 * ncap; + captured = slideTab + slideTabSize; + memset(captured, -1, capturedSize*sizeof(int)); + this->eng = eng; +} + +/* + Tries to match in str and returns an array of (begin, length) pairs + for captured text. If there is no match, all pairs are (-1, -1). +*/ +void QRegExpMatchState::match(const QChar *str0, int len0, int pos0, + bool minimal0, bool oneTest, int caretIndex) +{ + bool matched = false; + QChar char_null; + +#ifndef QT_NO_REGEXP_OPTIM + if (eng->trivial && !oneTest) { + // ### Qt6: qsizetype + pos = int(QtPrivate::findString(QStringView(str0, len0), pos0, QStringView(eng->goodStr.unicode(), eng->goodStr.length()), eng->cs)); + matchLen = eng->goodStr.length(); + matched = (pos != -1); + } else +#endif + { + in = str0; + if (in == nullptr) + in = &char_null; + pos = pos0; + caretPos = caretIndex; + len = len0; + minimal = minimal0; + matchLen = 0; + oneTestMatchedLen = 0; + + if (eng->valid && pos >= 0 && pos <= len) { +#ifndef QT_NO_REGEXP_OPTIM + if (oneTest) { + matched = matchHere(); + } else { + if (pos <= len - eng->minl) { + if (eng->caretAnchored) { + matched = matchHere(); + } else if (eng->useGoodStringHeuristic) { + matched = eng->goodStringMatch(*this); + } else { + matched = eng->badCharMatch(*this); + } + } + } +#else + matched = oneTest ? matchHere() : eng->bruteMatch(*this); +#endif + } + } + + if (matched) { + int *c = captured; + *c++ = pos; + *c++ = matchLen; + + int numCaptures = (capturedSize - 2) >> 1; +#ifndef QT_NO_REGEXP_CAPTURE + for (int i = 0; i < numCaptures; ++i) { + int j = eng->captureForOfficialCapture.at(i); + if (capBegin[j] != EmptyCapture) { + int len = capEnd[j] - capBegin[j]; + *c++ = (len > 0) ? pos + capBegin[j] : 0; + *c++ = len; + } else { + *c++ = -1; + *c++ = -1; + } + } +#endif + } else { + // we rely on 2's complement here + memset(captured, -1, capturedSize * sizeof(int)); + } +} + +/* + The three following functions add one state to the automaton and + return the number of the state. +*/ + +int QRegExpEngine::createState(QChar ch) +{ + return setupState(ch.unicode()); +} + +int QRegExpEngine::createState(const QRegExpCharClass &cc) +{ +#ifndef QT_NO_REGEXP_CCLASS + int n = cl.size(); + cl += QRegExpCharClass(cc); + return setupState(CharClassBit | n); +#else + Q_UNUSED(cc); + return setupState(CharClassBit); +#endif +} + +#ifndef QT_NO_REGEXP_BACKREF +int QRegExpEngine::createState(int bref) +{ + if (bref > nbrefs) { + nbrefs = bref; + if (nbrefs > MaxBackRefs) { + error(RXERR_LIMIT); + return 0; + } + } + return setupState(BackRefBit | bref); +} +#endif + +/* + The two following functions add a transition between all pairs of + states (i, j) where i is found in from, and j is found in to. + + Cat-transitions are distinguished from plus-transitions for + capturing. +*/ + +void QRegExpEngine::addCatTransitions(const QVector &from, const QVector &to) +{ + for (int i = 0; i < from.size(); i++) + mergeInto(&s[from.at(i)].outs, to); +} + +#ifndef QT_NO_REGEXP_CAPTURE +void QRegExpEngine::addPlusTransitions(const QVector &from, const QVector &to, int atom) +{ + for (int i = 0; i < from.size(); i++) { + QRegExpAutomatonState &st = s[from.at(i)]; + const QVector oldOuts = st.outs; + mergeInto(&st.outs, to); + if (f.at(atom).capture != QRegExpAtom::NoCapture) { + for (int j = 0; j < to.size(); j++) { + // ### st.reenter.contains(to.at(j)) check looks suspicious + if (!st.reenter.contains(to.at(j)) && + !std::binary_search(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j))) + st.reenter.insert(to.at(j), atom); + } + } + } +} +#endif + +#ifndef QT_NO_REGEXP_ANCHOR_ALT +/* + Returns an anchor that means a OR b. +*/ +int QRegExpEngine::anchorAlternation(int a, int b) +{ + if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) + return a & b; + + int n = aa.size(); +#ifndef QT_NO_REGEXP_OPTIM + if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b) + return Anchor_Alternation | (n - 1); +#endif + + QRegExpAnchorAlternation element = {a, b}; + aa.append(element); + return Anchor_Alternation | n; +} + +/* + Returns an anchor that means a AND b. +*/ +int QRegExpEngine::anchorConcatenation(int a, int b) +{ + if (((a | b) & Anchor_Alternation) == 0) + return a | b; + if ((b & Anchor_Alternation) != 0) + qSwap(a, b); + + int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b); + int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b); + return anchorAlternation(aprime, bprime); +} +#endif + +/* + Adds anchor a on a transition caracterised by its from state and + its to state. +*/ +void QRegExpEngine::addAnchors(int from, int to, int a) +{ + QRegExpAutomatonState &st = s[from]; + if (st.anchors.contains(to)) + a = anchorAlternation(st.anchors.value(to), a); + st.anchors.insert(to, a); +} + +#ifndef QT_NO_REGEXP_OPTIM +/* + This function chooses between the good-string and the bad-character + heuristics. It computes two scores and chooses the heuristic with + the highest score. + + Here are some common-sense constraints on the scores that should be + respected if the formulas are ever modified: (1) If goodStr is + empty, the good-string heuristic scores 0. (2) If the regular + expression is trivial, the good-string heuristic should be used. + (3) If the search is case insensitive, the good-string heuristic + should be used, unless it scores 0. (Case insensitivity turns all + entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is + big, the good-string heuristic should score less. +*/ +void QRegExpEngine::heuristicallyChooseHeuristic() +{ + if (minl == 0) { + useGoodStringHeuristic = false; + } else if (trivial) { + useGoodStringHeuristic = true; + } else { + /* + Magic formula: The good string has to constitute a good + proportion of the minimum-length string, and appear at a + more-or-less known index. + */ + int goodStringScore = (64 * goodStr.length() / minl) - + (goodLateStart - goodEarlyStart); + /* + Less magic formula: We pick some characters at random, and + check whether they are good or bad. + */ + int badCharScore = 0; + int step = qMax(1, NumBadChars / 32); + for (int i = 1; i < NumBadChars; i += step) { + if (occ1.at(i) == NoOccurrence) + badCharScore += minl; + else + badCharScore += occ1.at(i); + } + badCharScore /= minl; + useGoodStringHeuristic = (goodStringScore > badCharScore); + } +} +#endif + +#if defined(QT_DEBUG) +void QRegExpEngine::dump() const +{ + int i, j; + qDebug("Case %ssensitive engine", cs ? "" : "in"); + qDebug(" States"); + for (i = 0; i < s.size(); i++) { + qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : ""); +#ifndef QT_NO_REGEXP_CAPTURE + if (nf > 0) + qDebug(" in atom %d", s[i].atom); +#endif + int m = s[i].match; + if ((m & CharClassBit) != 0) { + qDebug(" match character class %d", m ^ CharClassBit); +#ifndef QT_NO_REGEXP_CCLASS + cl[m ^ CharClassBit].dump(); +#else + qDebug(" negative character class"); +#endif + } else if ((m & BackRefBit) != 0) { + qDebug(" match back-reference %d", m ^ BackRefBit); + } else if (m >= 0x20 && m <= 0x7e) { + qDebug(" match 0x%.4x (%c)", m, m); + } else { + qDebug(" match 0x%.4x", m); + } + for (j = 0; j < s[i].outs.size(); j++) { + int next = s[i].outs[j]; + qDebug(" -> %d", next); + if (s[i].reenter.contains(next)) + qDebug(" [reenter %d]", s[i].reenter[next]); + if (s[i].anchors.value(next) != 0) + qDebug(" [anchors 0x%.8x]", s[i].anchors[next]); + } + } +#ifndef QT_NO_REGEXP_CAPTURE + if (nf > 0) { + qDebug(" Atom Parent Capture"); + for (i = 0; i < nf; i++) { + if (f[i].capture == QRegExpAtom::NoCapture) { + qDebug(" %6d %6d nil", i, f[i].parent); + } else { + int cap = f[i].capture; + bool official = captureForOfficialCapture.contains(cap); + qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture, + official ? "official" : ""); + } + } + } +#endif +#ifndef QT_NO_REGEXP_ANCHOR_ALT + for (i = 0; i < aa.size(); i++) + qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b); +#endif +} +#endif + +void QRegExpEngine::setup() +{ + ref.storeRelaxed(1); +#ifndef QT_NO_REGEXP_CAPTURE + f.resize(32); + nf = 0; + cf = -1; +#endif + officialncap = 0; + ncap = 0; +#ifndef QT_NO_REGEXP_OPTIM + caretAnchored = true; + trivial = true; +#endif + valid = false; +#ifndef QT_NO_REGEXP_BACKREF + nbrefs = 0; +#endif +#ifndef QT_NO_REGEXP_OPTIM + useGoodStringHeuristic = true; + minl = 0; + occ1.fill(0, NumBadChars); +#endif +} + +int QRegExpEngine::setupState(int match) +{ +#ifndef QT_NO_REGEXP_CAPTURE + s += QRegExpAutomatonState(cf, match); +#else + s += QRegExpAutomatonState(match); +#endif + return s.size() - 1; +} + +#ifndef QT_NO_REGEXP_CAPTURE +/* + Functions startAtom() and finishAtom() should be called to delimit + atoms. When a state is created, it is assigned to the current atom. + The information is later used for capturing. +*/ +int QRegExpEngine::startAtom(bool officialCapture) +{ + if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size()) + f.resize((nf + 1) << 1); + f[nf].parent = cf; + cf = nf++; + f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture; + return cf; +} + +void QRegExpEngine::finishAtom(int atom, bool needCapture) +{ + if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture) + f[atom].capture = QRegExpAtom::UnofficialCapture; + cf = f.at(atom).parent; +} +#endif + +#ifndef QT_NO_REGEXP_LOOKAHEAD +/* + Creates a lookahead anchor. +*/ +int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative) +{ + int n = ahead.size(); + if (n == MaxLookaheads) { + error(RXERR_LIMIT); + return 0; + } + ahead += new QRegExpLookahead(eng, negative); + return Anchor_FirstLookahead << n; +} +#endif + +#ifndef QT_NO_REGEXP_CAPTURE +/* + We want the longest leftmost captures. +*/ +static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2, + const int *end2) +{ + for (int i = 0; i < ncap; i++) { + int delta = begin2[i] - begin1[i]; // it has to start early... + if (delta == 0) + delta = end1[i] - end2[i]; // ...and end late + + if (delta != 0) + return delta > 0; + } + return false; +} +#endif + +/* + Returns \c true if anchor a matches at position pos + i in the input + string, otherwise false. +*/ +bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin) +{ + int j; + +#ifndef QT_NO_REGEXP_ANCHOR_ALT + if ((a & QRegExpEngine::Anchor_Alternation) != 0) + return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin) + || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin); +#endif + + if ((a & QRegExpEngine::Anchor_Caret) != 0) { + if (pos + i != caretPos) + return false; + } + if ((a & QRegExpEngine::Anchor_Dollar) != 0) { + if (pos + i != len) + return false; + } +#ifndef QT_NO_REGEXP_ESCAPE + if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) { + bool before = false; + bool after = false; + if (pos + i != 0) + before = isWord(in[pos + i - 1]); + if (pos + i != len) + after = isWord(in[pos + i]); + if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after)) + return false; + if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after)) + return false; + } +#endif +#ifndef QT_NO_REGEXP_LOOKAHEAD + if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) { + const QVector &ahead = eng->ahead; + for (j = 0; j < ahead.size(); j++) { + if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) { + QRegExpMatchState matchState; + matchState.prepareForMatch(ahead[j]->eng); + matchState.match(in + pos + i, len - pos - i, 0, + true, true, caretPos - pos - i); + if ((matchState.captured[0] == 0) == ahead[j]->neg) + return false; + } + } + } +#endif +#ifndef QT_NO_REGEXP_CAPTURE +#ifndef QT_NO_REGEXP_BACKREF + for (j = 0; j < eng->nbrefs; j++) { + if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) { + int i = eng->captureForOfficialCapture.at(j); + if (capBegin[i] != EmptyCapture) + return false; + } + } +#endif +#endif + return true; +} + +#ifndef QT_NO_REGEXP_OPTIM +/* + The three following functions are what Jeffrey Friedl would call + transmissions (or bump-alongs). Using one or the other should make + no difference except in performance. +*/ + +bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const +{ + int k = matchState.pos + goodEarlyStart; + QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs); + while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) { + int from = k - goodLateStart; + int to = k - goodEarlyStart; + if (from > matchState.pos) + matchState.pos = from; + + while (matchState.pos <= to) { + if (matchState.matchHere()) + return true; + ++matchState.pos; + } + ++k; + } + return false; +} + +bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const +{ + int slideHead = 0; + int slideNext = 0; + int i; + int lastPos = matchState.len - minl; + memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int)); + + /* + Set up the slide table, used for the bad-character heuristic, + using the table of first occurrence of each character. + */ + for (i = 0; i < minl; i++) { + int sk = occ1[BadChar(matchState.in[matchState.pos + i])]; + if (sk == NoOccurrence) + sk = i + 1; + if (sk > 0) { + int k = i + 1 - sk; + if (k < 0) { + sk = i + 1; + k = 0; + } + if (sk > matchState.slideTab[k]) + matchState.slideTab[k] = sk; + } + } + + if (matchState.pos > lastPos) + return false; + + for (;;) { + if (++slideNext >= matchState.slideTabSize) + slideNext = 0; + if (matchState.slideTab[slideHead] > 0) { + if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext]) + matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1; + matchState.slideTab[slideHead] = 0; + } else { + if (matchState.matchHere()) + return true; + } + + if (matchState.pos == lastPos) + break; + + /* + Update the slide table. This code has much in common with + the initialization code. + */ + int sk = occ1[BadChar(matchState.in[matchState.pos + minl])]; + if (sk == NoOccurrence) { + matchState.slideTab[slideNext] = minl; + } else if (sk > 0) { + int k = slideNext + minl - sk; + if (k >= matchState.slideTabSize) + k -= matchState.slideTabSize; + if (sk > matchState.slideTab[k]) + matchState.slideTab[k] = sk; + } + slideHead = slideNext; + ++matchState.pos; + } + return false; +} +#else +bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const +{ + while (matchState.pos <= matchState.len) { + if (matchState.matchHere()) + return true; + ++matchState.pos; + } + return false; +} +#endif + +/* + Here's the core of the engine. It tries to do a match here and now. +*/ +bool QRegExpMatchState::matchHere() +{ + int ncur = 1, nnext = 0; + int i = 0, j, k, m; + bool stop = false; + + matchLen = -1; + oneTestMatchedLen = -1; + curStack[0] = QRegExpEngine::InitialState; + + int ncap = eng->ncap; +#ifndef QT_NO_REGEXP_CAPTURE + if (ncap > 0) { + for (j = 0; j < ncap; j++) { + curCapBegin[j] = EmptyCapture; + curCapEnd[j] = EmptyCapture; + } + } +#endif + +#ifndef QT_NO_REGEXP_BACKREF + while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop) +#else + while (ncur > 0 && i <= len - pos && !stop) +#endif + { + int ch = (i < len - pos) ? in[pos + i].unicode() : 0; + for (j = 0; j < ncur; j++) { + int cur = curStack[j]; + const QRegExpAutomatonState &scur = eng->s.at(cur); + const QVector &outs = scur.outs; + for (k = 0; k < outs.size(); k++) { + int next = outs.at(k); + const QRegExpAutomatonState &snext = eng->s.at(next); + bool inside = true; +#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) + int needSomeSleep = 0; +#endif + + /* + First, check if the anchors are anchored properly. + */ + int a = scur.anchors.value(next); + if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap)) + inside = false; + + /* + If indeed they are, check if the input character is + correct for this transition. + */ + if (inside) { + m = snext.match; + if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) { + if (eng->cs) + inside = (m == ch); + else + inside = (QChar(m).toLower() == QChar(ch).toLower()); + } else if (next == QRegExpEngine::FinalState) { + matchLen = i; + stop = minimal; + inside = true; + } else if ((m & QRegExpEngine::CharClassBit) != 0) { +#ifndef QT_NO_REGEXP_CCLASS + const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit); + if (eng->cs) + inside = cc.in(QChar(ch)); + else if (cc.negative()) + inside = cc.in(QChar(ch).toLower()) && + cc.in(QChar(ch).toUpper()); + else + inside = cc.in(QChar(ch).toLower()) || + cc.in(QChar(ch).toUpper()); +#endif +#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) + } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */ + int bref = m ^ QRegExpEngine::BackRefBit; + int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1); + + inside = bref <= ncap && curCapBegin[ell] != EmptyCapture; + if (inside) { + if (eng->cs) + inside = (in[pos + curCapBegin[ell]] == QChar(ch)); + else + inside = (in[pos + curCapBegin[ell]].toLower() + == QChar(ch).toLower()); + } + + if (inside) { + int delta; + if (curCapEnd[ell] == EmptyCapture) + delta = i - curCapBegin[ell]; + else + delta = curCapEnd[ell] - curCapBegin[ell]; + + inside = (delta <= len - (pos + i)); + if (inside && delta > 1) { + int n = 1; + if (eng->cs) { + while (n < delta) { + if (in[pos + curCapBegin[ell] + n] + != in[pos + i + n]) + break; + ++n; + } + } else { + while (n < delta) { + QChar a = in[pos + curCapBegin[ell] + n]; + QChar b = in[pos + i + n]; + if (a.toLower() != b.toLower()) + break; + ++n; + } + } + inside = (n == delta); + if (inside) + needSomeSleep = delta - 1; + } + } +#endif + } + } + + /* + We must now update our data structures. + */ + if (inside) { +#ifndef QT_NO_REGEXP_CAPTURE + int *capBegin, *capEnd; +#endif + /* + If the next state was not encountered yet, all + is fine. + */ + if ((m = inNextStack[next]) == -1) { + m = nnext++; + nextStack[m] = next; + inNextStack[next] = m; +#ifndef QT_NO_REGEXP_CAPTURE + capBegin = nextCapBegin + m * ncap; + capEnd = nextCapEnd + m * ncap; + + /* + Otherwise, we'll first maintain captures in + temporary arrays, and decide at the end whether + it's best to keep the previous capture zones or + the new ones. + */ + } else { + capBegin = tempCapBegin; + capEnd = tempCapEnd; +#endif + } + +#ifndef QT_NO_REGEXP_CAPTURE + /* + Updating the capture zones is much of a task. + */ + if (ncap > 0) { + memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int)); + memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int)); + int c = scur.atom, n = snext.atom; + int p = -1, q = -1; + int cap; + + /* + Lemma 1. For any x in the range [0..nf), we + have f[x].parent < x. + + Proof. By looking at startAtom(), it is + clear that cf < nf holds all the time, and + thus that f[nf].parent < nf. + */ + + /* + If we are reentering an atom, we empty all + capture zones inside it. + */ + if ((q = scur.reenter.value(next)) != 0) { + QBitArray b(eng->nf, false); + b.setBit(q, true); + for (int ell = q + 1; ell < eng->nf; ell++) { + if (b.testBit(eng->f.at(ell).parent)) { + b.setBit(ell, true); + cap = eng->f.at(ell).capture; + if (cap >= 0) { + capBegin[cap] = EmptyCapture; + capEnd[cap] = EmptyCapture; + } + } + } + p = eng->f.at(q).parent; + + /* + Otherwise, close the capture zones we are + leaving. We are leaving f[c].capture, + f[f[c].parent].capture, + f[f[f[c].parent].parent].capture, ..., + until f[x].capture, with x such that + f[x].parent is the youngest common ancestor + for c and n. + + We go up along c's and n's ancestry until + we find x. + */ + } else { + p = c; + q = n; + while (p != q) { + if (p > q) { + cap = eng->f.at(p).capture; + if (cap >= 0) { + if (capBegin[cap] == i) { + capBegin[cap] = EmptyCapture; + capEnd[cap] = EmptyCapture; + } else { + capEnd[cap] = i; + } + } + p = eng->f.at(p).parent; + } else { + q = eng->f.at(q).parent; + } + } + } + + /* + In any case, we now open the capture zones + we are entering. We work upwards from n + until we reach p (the parent of the atom we + reenter or the youngest common ancestor). + */ + while (n > p) { + cap = eng->f.at(n).capture; + if (cap >= 0) { + capBegin[cap] = i; + capEnd[cap] = EmptyCapture; + } + n = eng->f.at(n).parent; + } + /* + If the next state was already in + nextStack, we must choose carefully which + capture zones we want to keep. + */ + if (capBegin == tempCapBegin && + isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap, + nextCapEnd + m * ncap)) { + memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int)); + memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int)); + } + } +#ifndef QT_NO_REGEXP_BACKREF + /* + We are done with updating the capture zones. + It's now time to put the next state to sleep, + if it needs to, and to remove it from + nextStack. + */ + if (needSomeSleep > 0) { + QVector zzZ(2 + 2 * ncap); + zzZ[0] = i + needSomeSleep; + zzZ[1] = next; + if (ncap > 0) { + memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int)); + memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int)); + } + inNextStack[nextStack[--nnext]] = -1; + sleeping.append(zzZ); + } +#endif +#endif + } + } + } +#ifndef QT_NO_REGEXP_CAPTURE + /* + If we reached the final state, hurray! Copy the captured + zone. + */ + if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) { + memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int)); + memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int)); + } +#ifndef QT_NO_REGEXP_BACKREF + /* + It's time to wake up the sleepers. + */ + j = 0; + while (j < sleeping.count()) { + if (sleeping.at(j)[0] == i) { + const QVector &zzZ = sleeping.at(j); + int next = zzZ[1]; + const int *capBegin = zzZ.data() + 2; + const int *capEnd = zzZ.data() + 2 + ncap; + bool copyOver = true; + + if ((m = inNextStack[next]) == -1) { + m = nnext++; + nextStack[m] = next; + inNextStack[next] = m; + } else { + copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap, + capBegin, capEnd); + } + if (copyOver) { + memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int)); + memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int)); + } + + sleeping.removeAt(j); + } else { + ++j; + } + } +#endif +#endif + for (j = 0; j < nnext; j++) + inNextStack[nextStack[j]] = -1; + + // avoid needless iteration that confuses oneTestMatchedLen + if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState +#ifndef QT_NO_REGEXP_BACKREF + && sleeping.isEmpty() +#endif + ) + stop = true; + + qSwap(curStack, nextStack); +#ifndef QT_NO_REGEXP_CAPTURE + qSwap(curCapBegin, nextCapBegin); + qSwap(curCapEnd, nextCapEnd); +#endif + ncur = nnext; + nnext = 0; + ++i; + } + +#ifndef QT_NO_REGEXP_BACKREF + /* + If minimal matching is enabled, we might have some sleepers + left. + */ + if (!sleeping.isEmpty()) + sleeping.clear(); +#endif + + oneTestMatchedLen = i - 1; + return (matchLen >= 0); +} + +#ifndef QT_NO_REGEXP_CCLASS + +QRegExpCharClass::QRegExpCharClass() + : c(0), n(false) +{ +#ifndef QT_NO_REGEXP_OPTIM + occ1.fill(NoOccurrence, NumBadChars); +#endif +} + +void QRegExpCharClass::clear() +{ + c = 0; + r.clear(); + n = false; +} + +void QRegExpCharClass::setNegative(bool negative) +{ + n = negative; +#ifndef QT_NO_REGEXP_OPTIM + occ1.fill(0, NumBadChars); +#endif +} + +void QRegExpCharClass::addCategories(uint cats) +{ + static const int all_cats = FLAG(QChar::Mark_NonSpacing) | + FLAG(QChar::Mark_SpacingCombining) | + FLAG(QChar::Mark_Enclosing) | + FLAG(QChar::Number_DecimalDigit) | + FLAG(QChar::Number_Letter) | + FLAG(QChar::Number_Other) | + FLAG(QChar::Separator_Space) | + FLAG(QChar::Separator_Line) | + FLAG(QChar::Separator_Paragraph) | + FLAG(QChar::Other_Control) | + FLAG(QChar::Other_Format) | + FLAG(QChar::Other_Surrogate) | + FLAG(QChar::Other_PrivateUse) | + FLAG(QChar::Other_NotAssigned) | + FLAG(QChar::Letter_Uppercase) | + FLAG(QChar::Letter_Lowercase) | + FLAG(QChar::Letter_Titlecase) | + FLAG(QChar::Letter_Modifier) | + FLAG(QChar::Letter_Other) | + FLAG(QChar::Punctuation_Connector) | + FLAG(QChar::Punctuation_Dash) | + FLAG(QChar::Punctuation_Open) | + FLAG(QChar::Punctuation_Close) | + FLAG(QChar::Punctuation_InitialQuote) | + FLAG(QChar::Punctuation_FinalQuote) | + FLAG(QChar::Punctuation_Other) | + FLAG(QChar::Symbol_Math) | + FLAG(QChar::Symbol_Currency) | + FLAG(QChar::Symbol_Modifier) | + FLAG(QChar::Symbol_Other); + c |= (all_cats & cats); +#ifndef QT_NO_REGEXP_OPTIM + occ1.fill(0, NumBadChars); +#endif +} + +void QRegExpCharClass::addRange(ushort from, ushort to) +{ + if (from > to) + qSwap(from, to); + int m = r.size(); + r.resize(m + 1); + r[m].from = from; + r[m].len = to - from + 1; + +#ifndef QT_NO_REGEXP_OPTIM + int i; + + if (to - from < NumBadChars) { + if (from % NumBadChars <= to % NumBadChars) { + for (i = from % NumBadChars; i <= to % NumBadChars; i++) + occ1[i] = 0; + } else { + for (i = 0; i <= to % NumBadChars; i++) + occ1[i] = 0; + for (i = from % NumBadChars; i < NumBadChars; i++) + occ1[i] = 0; + } + } else { + occ1.fill(0, NumBadChars); + } +#endif +} + +bool QRegExpCharClass::in(QChar ch) const +{ +#ifndef QT_NO_REGEXP_OPTIM + if (occ1.at(BadChar(ch)) == NoOccurrence) + return n; +#endif + + if (c != 0 && (c & FLAG(ch.category())) != 0) + return !n; + + const int uc = ch.unicode(); + int size = r.size(); + + for (int i = 0; i < size; ++i) { + const QRegExpCharClassRange &range = r.at(i); + if (uint(uc - range.from) < uint(r.at(i).len)) + return !n; + } + return n; +} + +#if defined(QT_DEBUG) +void QRegExpCharClass::dump() const +{ + int i; + qDebug(" %stive character class", n ? "nega" : "posi"); +#ifndef QT_NO_REGEXP_CCLASS + if (c != 0) + qDebug(" categories 0x%.8x", c); +#endif + for (i = 0; i < r.size(); i++) + qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1); +} +#endif +#endif + +QRegExpEngine::Box::Box(QRegExpEngine *engine) + : eng(engine), skipanchors(0) +#ifndef QT_NO_REGEXP_OPTIM + , earlyStart(0), lateStart(0), maxl(0) +#endif +{ +#ifndef QT_NO_REGEXP_OPTIM + occ1.fill(NoOccurrence, NumBadChars); +#endif + minl = 0; +} + +QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b) +{ + eng = b.eng; + ls = b.ls; + rs = b.rs; + lanchors = b.lanchors; + ranchors = b.ranchors; + skipanchors = b.skipanchors; +#ifndef QT_NO_REGEXP_OPTIM + earlyStart = b.earlyStart; + lateStart = b.lateStart; + str = b.str; + leftStr = b.leftStr; + rightStr = b.rightStr; + maxl = b.maxl; + occ1 = b.occ1; +#endif + minl = b.minl; + return *this; +} + +void QRegExpEngine::Box::set(QChar ch) +{ + ls.resize(1); + ls[0] = eng->createState(ch); + rs = ls; +#ifndef QT_NO_REGEXP_OPTIM + str = ch; + leftStr = ch; + rightStr = ch; + maxl = 1; + occ1[BadChar(ch)] = 0; +#endif + minl = 1; +} + +void QRegExpEngine::Box::set(const QRegExpCharClass &cc) +{ + ls.resize(1); + ls[0] = eng->createState(cc); + rs = ls; +#ifndef QT_NO_REGEXP_OPTIM + maxl = 1; + occ1 = cc.firstOccurrence(); +#endif + minl = 1; +} + +#ifndef QT_NO_REGEXP_BACKREF +void QRegExpEngine::Box::set(int bref) +{ + ls.resize(1); + ls[0] = eng->createState(bref); + rs = ls; + if (bref >= 1 && bref <= MaxBackRefs) + skipanchors = Anchor_BackRef0Empty << bref; +#ifndef QT_NO_REGEXP_OPTIM + maxl = InftyLen; +#endif + minl = 0; +} +#endif + +void QRegExpEngine::Box::cat(const Box &b) +{ + eng->addCatTransitions(rs, b.ls); + addAnchorsToEngine(b); + if (minl == 0) { + lanchors.unite(b.lanchors); + if (skipanchors != 0) { + for (int i = 0; i < b.ls.size(); i++) { + int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors); + lanchors.insert(b.ls.at(i), a); + } + } + mergeInto(&ls, b.ls); + } + if (b.minl == 0) { + ranchors.unite(b.ranchors); + if (b.skipanchors != 0) { + for (int i = 0; i < rs.size(); i++) { + int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors); + ranchors.insert(rs.at(i), a); + } + } + mergeInto(&rs, b.rs); + } else { + ranchors = b.ranchors; + rs = b.rs; + } + +#ifndef QT_NO_REGEXP_OPTIM + if (maxl != InftyLen) { + if (rightStr.length() + b.leftStr.length() > + qMax(str.length(), b.str.length())) { + earlyStart = minl - rightStr.length(); + lateStart = maxl - rightStr.length(); + str = rightStr + b.leftStr; + } else if (b.str.length() > str.length()) { + earlyStart = minl + b.earlyStart; + lateStart = maxl + b.lateStart; + str = b.str; + } + } + + if (leftStr.length() == maxl) + leftStr += b.leftStr; + + if (b.rightStr.length() == b.maxl) { + rightStr += b.rightStr; + } else { + rightStr = b.rightStr; + } + + if (maxl == InftyLen || b.maxl == InftyLen) { + maxl = InftyLen; + } else { + maxl += b.maxl; + } + + for (int i = 0; i < NumBadChars; i++) { + if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i)) + occ1[i] = minl + b.occ1.at(i); + } +#endif + + minl += b.minl; + if (minl == 0) + skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors); + else + skipanchors = 0; +} + +void QRegExpEngine::Box::orx(const Box &b) +{ + mergeInto(&ls, b.ls); + lanchors.unite(b.lanchors); + mergeInto(&rs, b.rs); + ranchors.unite(b.ranchors); + + if (b.minl == 0) { + if (minl == 0) + skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors); + else + skipanchors = b.skipanchors; + } + +#ifndef QT_NO_REGEXP_OPTIM + for (int i = 0; i < NumBadChars; i++) { + if (occ1.at(i) > b.occ1.at(i)) + occ1[i] = b.occ1.at(i); + } + earlyStart = 0; + lateStart = 0; + str = QString(); + leftStr = QString(); + rightStr = QString(); + if (b.maxl > maxl) + maxl = b.maxl; +#endif + if (b.minl < minl) + minl = b.minl; +} + +void QRegExpEngine::Box::plus(int atom) +{ +#ifndef QT_NO_REGEXP_CAPTURE + eng->addPlusTransitions(rs, ls, atom); +#else + Q_UNUSED(atom); + eng->addCatTransitions(rs, ls); +#endif + addAnchorsToEngine(*this); +#ifndef QT_NO_REGEXP_OPTIM + maxl = InftyLen; +#endif +} + +void QRegExpEngine::Box::opt() +{ +#ifndef QT_NO_REGEXP_OPTIM + earlyStart = 0; + lateStart = 0; + str = QString(); + leftStr = QString(); + rightStr = QString(); +#endif + skipanchors = 0; + minl = 0; +} + +void QRegExpEngine::Box::catAnchor(int a) +{ + if (a != 0) { + for (int i = 0; i < rs.size(); i++) { + a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a); + ranchors.insert(rs.at(i), a); + } + if (minl == 0) + skipanchors = eng->anchorConcatenation(skipanchors, a); + } +} + +#ifndef QT_NO_REGEXP_OPTIM +void QRegExpEngine::Box::setupHeuristics() +{ + eng->goodEarlyStart = earlyStart; + eng->goodLateStart = lateStart; + eng->goodStr = eng->cs ? str : str.toLower(); + + eng->minl = minl; + if (eng->cs) { + /* + A regular expression such as 112|1 has occ1['2'] = 2 and minl = + 1 at this point. An entry of occ1 has to be at most minl or + infinity for the rest of the algorithm to go well. + + We waited until here before normalizing these cases (instead of + doing it in Box::orx()) because sometimes things improve by + themselves. Consider for example (112|1)34. + */ + for (int i = 0; i < NumBadChars; i++) { + if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl) + occ1[i] = minl; + } + eng->occ1 = occ1; + } else { + eng->occ1.fill(0, NumBadChars); + } + + eng->heuristicallyChooseHeuristic(); +} +#endif + +#if defined(QT_DEBUG) +void QRegExpEngine::Box::dump() const +{ + int i; + qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s"); + qDebug(" Left states:"); + for (i = 0; i < ls.size(); i++) { + if (lanchors.value(ls[i], 0) == 0) + qDebug(" %d", ls[i]); + else + qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]); + } + qDebug(" Right states:"); + for (i = 0; i < rs.size(); i++) { + if (ranchors.value(rs[i], 0) == 0) + qDebug(" %d", rs[i]); + else + qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]); + } + qDebug(" Skip anchors: 0x%.8x", skipanchors); +} +#endif + +void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const +{ + for (int i = 0; i < to.ls.size(); i++) { + for (int j = 0; j < rs.size(); j++) { + int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0), + to.lanchors.value(to.ls.at(i), 0)); + eng->addAnchors(rs[j], to.ls[i], a); + } + } +} + +#ifndef QT_NO_REGEXP_CCLASS +// fast lookup hash for xml schema extensions +// sorted by name for b-search +static const struct CategoriesRangeMapEntry { + const char name[40]; + uint first, second; +} categoriesRangeMap[] = { + { "AegeanNumbers", 0x10100, 0x1013F }, + { "AlphabeticPresentationForms", 0xFB00, 0xFB4F }, + { "AncientGreekMusicalNotation", 0x1D200, 0x1D24F }, + { "AncientGreekNumbers", 0x10140, 0x1018F }, + { "Arabic", 0x0600, 0x06FF }, + { "ArabicPresentationForms-A", 0xFB50, 0xFDFF }, + { "ArabicPresentationForms-B", 0xFE70, 0xFEFF }, + { "ArabicSupplement", 0x0750, 0x077F }, + { "Armenian", 0x0530, 0x058F }, + { "Arrows", 0x2190, 0x21FF }, + { "BasicLatin", 0x0000, 0x007F }, + { "Bengali", 0x0980, 0x09FF }, + { "BlockElements", 0x2580, 0x259F }, + { "Bopomofo", 0x3100, 0x312F }, + { "BopomofoExtended", 0x31A0, 0x31BF }, + { "BoxDrawing", 0x2500, 0x257F }, + { "BraillePatterns", 0x2800, 0x28FF }, + { "Buginese", 0x1A00, 0x1A1F }, + { "Buhid", 0x1740, 0x175F }, + { "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF }, + { "CJKCompatibility", 0x3300, 0x33FF }, + { "CJKCompatibilityForms", 0xFE30, 0xFE4F }, + { "CJKCompatibilityIdeographs", 0xF900, 0xFAFF }, + { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F }, + { "CJKRadicalsSupplement", 0x2E80, 0x2EFF }, + { "CJKStrokes", 0x31C0, 0x31EF }, + { "CJKSymbolsandPunctuation", 0x3000, 0x303F }, + { "CJKUnifiedIdeographs", 0x4E00, 0x9FFF }, + { "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 }, + { "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF }, + { "Cherokee", 0x13A0, 0x13FF }, + { "CombiningDiacriticalMarks", 0x0300, 0x036F }, + { "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF }, + { "CombiningHalfMarks", 0xFE20, 0xFE2F }, + { "CombiningMarksforSymbols", 0x20D0, 0x20FF }, + { "ControlPictures", 0x2400, 0x243F }, + { "Coptic", 0x2C80, 0x2CFF }, + { "CurrencySymbols", 0x20A0, 0x20CF }, + { "CypriotSyllabary", 0x10800, 0x1083F }, + { "Cyrillic", 0x0400, 0x04FF }, + { "CyrillicSupplement", 0x0500, 0x052F }, + { "Deseret", 0x10400, 0x1044F }, + { "Devanagari", 0x0900, 0x097F }, + { "Dingbats", 0x2700, 0x27BF }, + { "EnclosedAlphanumerics", 0x2460, 0x24FF }, + { "EnclosedCJKLettersandMonths", 0x3200, 0x32FF }, + { "Ethiopic", 0x1200, 0x137F }, + { "EthiopicExtended", 0x2D80, 0x2DDF }, + { "EthiopicSupplement", 0x1380, 0x139F }, + { "GeneralPunctuation", 0x2000, 0x206F }, + { "GeometricShapes", 0x25A0, 0x25FF }, + { "Georgian", 0x10A0, 0x10FF }, + { "GeorgianSupplement", 0x2D00, 0x2D2F }, + { "Glagolitic", 0x2C00, 0x2C5F }, + { "Gothic", 0x10330, 0x1034F }, + { "Greek", 0x0370, 0x03FF }, + { "GreekExtended", 0x1F00, 0x1FFF }, + { "Gujarati", 0x0A80, 0x0AFF }, + { "Gurmukhi", 0x0A00, 0x0A7F }, + { "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF }, + { "HangulCompatibilityJamo", 0x3130, 0x318F }, + { "HangulJamo", 0x1100, 0x11FF }, + { "HangulSyllables", 0xAC00, 0xD7A3 }, + { "Hanunoo", 0x1720, 0x173F }, + { "Hebrew", 0x0590, 0x05FF }, + { "Hiragana", 0x3040, 0x309F }, + { "IPAExtensions", 0x0250, 0x02AF }, + { "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF }, + { "Kanbun", 0x3190, 0x319F }, + { "KangxiRadicals", 0x2F00, 0x2FDF }, + { "Kannada", 0x0C80, 0x0CFF }, + { "Katakana", 0x30A0, 0x30FF }, + { "KatakanaPhoneticExtensions", 0x31F0, 0x31FF }, + { "Kharoshthi", 0x10A00, 0x10A5F }, + { "Khmer", 0x1780, 0x17FF }, + { "KhmerSymbols", 0x19E0, 0x19FF }, + { "Lao", 0x0E80, 0x0EFF }, + { "Latin-1Supplement", 0x0080, 0x00FF }, + { "LatinExtended-A", 0x0100, 0x017F }, + { "LatinExtended-B", 0x0180, 0x024F }, + { "LatinExtendedAdditional", 0x1E00, 0x1EFF }, + { "LetterlikeSymbols", 0x2100, 0x214F }, + { "Limbu", 0x1900, 0x194F }, + { "LinearBIdeograms", 0x10080, 0x100FF }, + { "LinearBSyllabary", 0x10000, 0x1007F }, + { "Malayalam", 0x0D00, 0x0D7F }, + { "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF }, + { "MathematicalOperators", 0x2200, 0x22FF }, + { "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF }, + { "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF }, + { "MiscellaneousSymbols", 0x2600, 0x26FF }, + { "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF }, + { "MiscellaneousTechnical", 0x2300, 0x23FF }, + { "ModifierToneLetters", 0xA700, 0xA71F }, + { "Mongolian", 0x1800, 0x18AF }, + { "MusicalSymbols", 0x1D100, 0x1D1FF }, + { "Myanmar", 0x1000, 0x109F }, + { "NewTaiLue", 0x1980, 0x19DF }, + { "NumberForms", 0x2150, 0x218F }, + { "Ogham", 0x1680, 0x169F }, + { "OldItalic", 0x10300, 0x1032F }, + { "OldPersian", 0x103A0, 0x103DF }, + { "OpticalCharacterRecognition", 0x2440, 0x245F }, + { "Oriya", 0x0B00, 0x0B7F }, + { "Osmanya", 0x10480, 0x104AF }, + { "PhoneticExtensions", 0x1D00, 0x1D7F }, + { "PhoneticExtensionsSupplement", 0x1D80, 0x1DBF }, + { "PrivateUse", 0xE000, 0xF8FF }, + { "Runic", 0x16A0, 0x16FF }, + { "Shavian", 0x10450, 0x1047F }, + { "Sinhala", 0x0D80, 0x0DFF }, + { "SmallFormVariants", 0xFE50, 0xFE6F }, + { "SpacingModifierLetters", 0x02B0, 0x02FF }, + { "Specials", 0xFFF0, 0xFFFF }, + { "SuperscriptsandSubscripts", 0x2070, 0x209F }, + { "SupplementalArrows-A", 0x27F0, 0x27FF }, + { "SupplementalArrows-B", 0x2900, 0x297F }, + { "SupplementalMathematicalOperators", 0x2A00, 0x2AFF }, + { "SupplementalPunctuation", 0x2E00, 0x2E7F }, + { "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF }, + { "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF }, + { "SylotiNagri", 0xA800, 0xA82F }, + { "Syriac", 0x0700, 0x074F }, + { "Tagalog", 0x1700, 0x171F }, + { "Tagbanwa", 0x1760, 0x177F }, + { "Tags", 0xE0000, 0xE007F }, + { "TaiLe", 0x1950, 0x197F }, + { "TaiXuanJingSymbols", 0x1D300, 0x1D35F }, + { "Tamil", 0x0B80, 0x0BFF }, + { "Telugu", 0x0C00, 0x0C7F }, + { "Thaana", 0x0780, 0x07BF }, + { "Thai", 0x0E00, 0x0E7F }, + { "Tibetan", 0x0F00, 0x0FFF }, + { "Tifinagh", 0x2D30, 0x2D7F }, + { "Ugaritic", 0x10380, 0x1039F }, + { "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F }, + { "VariationSelectors", 0xFE00, 0xFE0F }, + { "VariationSelectorsSupplement", 0xE0100, 0xE01EF }, + { "VerticalForms", 0xFE10, 0xFE1F }, + { "YiRadicals", 0xA490, 0xA4CF }, + { "YiSyllables", 0xA000, 0xA48F }, + { "YijingHexagramSymbols", 0x4DC0, 0x4DFF } +}; + +inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2) +{ return qstrcmp(entry1.name, entry2.name) < 0; } +inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry) +{ return qstrcmp(name, entry.name) < 0; } +inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name) +{ return qstrcmp(entry.name, name) < 0; } +#endif // QT_NO_REGEXP_CCLASS + +int QRegExpEngine::getChar() +{ + return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode(); +} + +int QRegExpEngine::getEscape() +{ +#ifndef QT_NO_REGEXP_ESCAPE + const char tab[] = "afnrtv"; // no b, as \b means word boundary + const char backTab[] = "\a\f\n\r\t\v"; + ushort low; + int i; +#endif + ushort val; + int prevCh = yyCh; + + if (prevCh == EOS) { + error(RXERR_END); + return Tok_Char | '\\'; + } + yyCh = getChar(); +#ifndef QT_NO_REGEXP_ESCAPE + if ((prevCh & ~0xff) == 0) { + const char *p = strchr(tab, prevCh); + if (p != nullptr) + return Tok_Char | backTab[p - tab]; + } +#endif + + switch (prevCh) { +#ifndef QT_NO_REGEXP_ESCAPE + case '0': + val = 0; + for (i = 0; i < 3; i++) { + if (yyCh >= '0' && yyCh <= '7') + val = (val << 3) | (yyCh - '0'); + else + break; + yyCh = getChar(); + } + if ((val & ~0377) != 0) + error(RXERR_OCTAL); + return Tok_Char | val; +#endif +#ifndef QT_NO_REGEXP_ESCAPE + case 'B': + return Tok_NonWord; +#endif +#ifndef QT_NO_REGEXP_CCLASS + case 'D': + // see QChar::isDigit() + yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit)); + return Tok_CharClass; + case 'S': + // see QChar::isSpace() + yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) | + FLAG(QChar::Separator_Line) | + FLAG(QChar::Separator_Paragraph) | + FLAG(QChar::Other_Control))); + yyCharClass->addRange(0x0000, 0x0008); + yyCharClass->addRange(0x000e, 0x001f); + yyCharClass->addRange(0x007f, 0x0084); + yyCharClass->addRange(0x0086, 0x009f); + return Tok_CharClass; + case 'W': + // see QChar::isLetterOrNumber() and QChar::isMark() + yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) | + FLAG(QChar::Mark_SpacingCombining) | + FLAG(QChar::Mark_Enclosing) | + FLAG(QChar::Number_DecimalDigit) | + FLAG(QChar::Number_Letter) | + FLAG(QChar::Number_Other) | + FLAG(QChar::Letter_Uppercase) | + FLAG(QChar::Letter_Lowercase) | + FLAG(QChar::Letter_Titlecase) | + FLAG(QChar::Letter_Modifier) | + FLAG(QChar::Letter_Other) | + FLAG(QChar::Punctuation_Connector))); + yyCharClass->addRange(0x203f, 0x2040); + yyCharClass->addSingleton(0x2040); + yyCharClass->addSingleton(0x2054); + yyCharClass->addSingleton(0x30fb); + yyCharClass->addRange(0xfe33, 0xfe34); + yyCharClass->addRange(0xfe4d, 0xfe4f); + yyCharClass->addSingleton(0xff3f); + yyCharClass->addSingleton(0xff65); + return Tok_CharClass; +#endif +#ifndef QT_NO_REGEXP_ESCAPE + case 'b': + return Tok_Word; +#endif +#ifndef QT_NO_REGEXP_CCLASS + case 'd': + // see QChar::isDigit() + yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); + return Tok_CharClass; + case 's': + // see QChar::isSpace() + yyCharClass->addCategories(FLAG(QChar::Separator_Space) | + FLAG(QChar::Separator_Line) | + FLAG(QChar::Separator_Paragraph)); + yyCharClass->addRange(0x0009, 0x000d); + yyCharClass->addSingleton(0x0085); + return Tok_CharClass; + case 'w': + // see QChar::isLetterOrNumber() and QChar::isMark() + yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | + FLAG(QChar::Mark_SpacingCombining) | + FLAG(QChar::Mark_Enclosing) | + FLAG(QChar::Number_DecimalDigit) | + FLAG(QChar::Number_Letter) | + FLAG(QChar::Number_Other) | + FLAG(QChar::Letter_Uppercase) | + FLAG(QChar::Letter_Lowercase) | + FLAG(QChar::Letter_Titlecase) | + FLAG(QChar::Letter_Modifier) | + FLAG(QChar::Letter_Other)); + yyCharClass->addSingleton(0x005f); // '_' + return Tok_CharClass; + case 'I': + if (xmlSchemaExtensions) { + yyCharClass->setNegative(!yyCharClass->negative()); + Q_FALLTHROUGH(); + } else { + break; + } + case 'i': + if (xmlSchemaExtensions) { + yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | + FLAG(QChar::Mark_SpacingCombining) | + FLAG(QChar::Mark_Enclosing) | + FLAG(QChar::Number_DecimalDigit) | + FLAG(QChar::Number_Letter) | + FLAG(QChar::Number_Other) | + FLAG(QChar::Letter_Uppercase) | + FLAG(QChar::Letter_Lowercase) | + FLAG(QChar::Letter_Titlecase) | + FLAG(QChar::Letter_Modifier) | + FLAG(QChar::Letter_Other)); + yyCharClass->addSingleton(0x003a); // ':' + yyCharClass->addSingleton(0x005f); // '_' + yyCharClass->addRange(0x0041, 0x005a); // [A-Z] + yyCharClass->addRange(0x0061, 0x007a); // [a-z] + yyCharClass->addRange(0xc0, 0xd6); + yyCharClass->addRange(0xd8, 0xf6); + yyCharClass->addRange(0xf8, 0x2ff); + yyCharClass->addRange(0x370, 0x37d); + yyCharClass->addRange(0x37f, 0x1fff); + yyCharClass->addRange(0x200c, 0x200d); + yyCharClass->addRange(0x2070, 0x218f); + yyCharClass->addRange(0x2c00, 0x2fef); + yyCharClass->addRange(0x3001, 0xd7ff); + yyCharClass->addRange(0xf900, 0xfdcf); + yyCharClass->addRange(0xfdf0, 0xfffd); + yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff); + return Tok_CharClass; + } else { + break; + } + case 'C': + if (xmlSchemaExtensions) { + yyCharClass->setNegative(!yyCharClass->negative()); + Q_FALLTHROUGH(); + } else { + break; + } + case 'c': + if (xmlSchemaExtensions) { + yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | + FLAG(QChar::Mark_SpacingCombining) | + FLAG(QChar::Mark_Enclosing) | + FLAG(QChar::Number_DecimalDigit) | + FLAG(QChar::Number_Letter) | + FLAG(QChar::Number_Other) | + FLAG(QChar::Letter_Uppercase) | + FLAG(QChar::Letter_Lowercase) | + FLAG(QChar::Letter_Titlecase) | + FLAG(QChar::Letter_Modifier) | + FLAG(QChar::Letter_Other)); + yyCharClass->addSingleton(0x002d); // '-' + yyCharClass->addSingleton(0x002e); // '.' + yyCharClass->addSingleton(0x003a); // ':' + yyCharClass->addSingleton(0x005f); // '_' + yyCharClass->addSingleton(0xb7); + yyCharClass->addRange(0x0030, 0x0039); // [0-9] + yyCharClass->addRange(0x0041, 0x005a); // [A-Z] + yyCharClass->addRange(0x0061, 0x007a); // [a-z] + yyCharClass->addRange(0xc0, 0xd6); + yyCharClass->addRange(0xd8, 0xf6); + yyCharClass->addRange(0xf8, 0x2ff); + yyCharClass->addRange(0x370, 0x37d); + yyCharClass->addRange(0x37f, 0x1fff); + yyCharClass->addRange(0x200c, 0x200d); + yyCharClass->addRange(0x2070, 0x218f); + yyCharClass->addRange(0x2c00, 0x2fef); + yyCharClass->addRange(0x3001, 0xd7ff); + yyCharClass->addRange(0xf900, 0xfdcf); + yyCharClass->addRange(0xfdf0, 0xfffd); + yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff); + yyCharClass->addRange(0x0300, 0x036f); + yyCharClass->addRange(0x203f, 0x2040); + return Tok_CharClass; + } else { + break; + } + case 'P': + if (xmlSchemaExtensions) { + yyCharClass->setNegative(!yyCharClass->negative()); + Q_FALLTHROUGH(); + } else { + break; + } + case 'p': + if (xmlSchemaExtensions) { + if (yyCh != '{') { + error(RXERR_CHARCLASS); + return Tok_CharClass; + } + + QByteArray category; + yyCh = getChar(); + while (yyCh != '}') { + if (yyCh == EOS) { + error(RXERR_END); + return Tok_CharClass; + } + category.append(yyCh); + yyCh = getChar(); + } + yyCh = getChar(); // skip closing '}' + + int catlen = category.length(); + if (catlen == 1 || catlen == 2) { + switch (category.at(0)) { + case 'M': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | + FLAG(QChar::Mark_SpacingCombining) | + FLAG(QChar::Mark_Enclosing)); + } else { + switch (category.at(1)) { + case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn + case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc + case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me + default: error(RXERR_CATEGORY); break; + } + } + break; + case 'N': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) | + FLAG(QChar::Number_Letter) | + FLAG(QChar::Number_Other)); + } else { + switch (category.at(1)) { + case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd + case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl + case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No + default: error(RXERR_CATEGORY); break; + } + } + break; + case 'Z': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Separator_Space) | + FLAG(QChar::Separator_Line) | + FLAG(QChar::Separator_Paragraph)); + } else { + switch (category.at(1)) { + case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs + case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl + case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp + default: error(RXERR_CATEGORY); break; + } + } + break; + case 'C': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Other_Control) | + FLAG(QChar::Other_Format) | + FLAG(QChar::Other_Surrogate) | + FLAG(QChar::Other_PrivateUse) | + FLAG(QChar::Other_NotAssigned)); + } else { + switch (category.at(1)) { + case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc + case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf + case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs + case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co + case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn + default: error(RXERR_CATEGORY); break; + } + } + break; + case 'L': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) | + FLAG(QChar::Letter_Lowercase) | + FLAG(QChar::Letter_Titlecase) | + FLAG(QChar::Letter_Modifier) | + FLAG(QChar::Letter_Other)); + } else { + switch (category.at(1)) { + case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu + case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll + case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt + case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm + case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo + default: error(RXERR_CATEGORY); break; + } + } + break; + case 'P': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) | + FLAG(QChar::Punctuation_Dash) | + FLAG(QChar::Punctuation_Open) | + FLAG(QChar::Punctuation_Close) | + FLAG(QChar::Punctuation_InitialQuote) | + FLAG(QChar::Punctuation_FinalQuote) | + FLAG(QChar::Punctuation_Other)); + } else { + switch (category.at(1)) { + case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc + case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd + case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps + case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe + case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi + case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf + case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po + default: error(RXERR_CATEGORY); break; + } + } + break; + case 'S': + if (catlen == 1) { + yyCharClass->addCategories(FLAG(QChar::Symbol_Math) | + FLAG(QChar::Symbol_Currency) | + FLAG(QChar::Symbol_Modifier) | + FLAG(QChar::Symbol_Other)); + } else { + switch (category.at(1)) { + case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm + case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc + case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk + case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So + default: error(RXERR_CATEGORY); break; + } + } + break; + default: + error(RXERR_CATEGORY); + break; + } + } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') { + static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]); + const char * const categoryFamily = category.constData() + 2; + const CategoriesRangeMapEntry *r = std::lower_bound(categoriesRangeMap, categoriesRangeMap + N, categoryFamily); + if (r != categoriesRangeMap + N && qstrcmp(r->name, categoryFamily) == 0) + yyCharClass->addRange(r->first, r->second); + else + error(RXERR_CATEGORY); + } else { + error(RXERR_CATEGORY); + } + return Tok_CharClass; + } else { + break; + } +#endif +#ifndef QT_NO_REGEXP_ESCAPE + case 'x': + val = 0; + for (i = 0; i < 4; i++) { + low = QChar(yyCh).toLower().unicode(); + if (low >= '0' && low <= '9') + val = (val << 4) | (low - '0'); + else if (low >= 'a' && low <= 'f') + val = (val << 4) | (low - 'a' + 10); + else + break; + yyCh = getChar(); + } + return Tok_Char | val; +#endif + default: + break; + } + if (prevCh >= '1' && prevCh <= '9') { +#ifndef QT_NO_REGEXP_BACKREF + val = prevCh - '0'; + while (yyCh >= '0' && yyCh <= '9') { + val = (val * 10) + (yyCh - '0'); + yyCh = getChar(); + } + return Tok_BackRef | val; +#else + error(RXERR_DISABLED); +#endif + } + return Tok_Char | prevCh; +} + +#ifndef QT_NO_REGEXP_INTERVAL +int QRegExpEngine::getRep(int def) +{ + if (yyCh >= '0' && yyCh <= '9') { + int rep = 0; + do { + rep = 10 * rep + yyCh - '0'; + if (rep >= InftyRep) { + error(RXERR_REPETITION); + rep = def; + } + yyCh = getChar(); + } while (yyCh >= '0' && yyCh <= '9'); + return rep; + } else { + return def; + } +} +#endif + +#ifndef QT_NO_REGEXP_LOOKAHEAD +void QRegExpEngine::skipChars(int n) +{ + if (n > 0) { + yyPos += n - 1; + yyCh = getChar(); + } +} +#endif + +void QRegExpEngine::error(const char *msg) +{ + if (yyError.isEmpty()) + yyError = QLatin1String(msg); +} + +void QRegExpEngine::startTokenizer(const QChar *rx, int len) +{ + yyIn = rx; + yyPos0 = 0; + yyPos = 0; + yyLen = len; + yyCh = getChar(); + yyCharClass.reset(new QRegExpCharClass); + yyMinRep = 0; + yyMaxRep = 0; + yyError = QString(); +} + +int QRegExpEngine::getToken() +{ +#ifndef QT_NO_REGEXP_CCLASS + ushort pendingCh = 0; + bool charPending; + bool rangePending; + int tok; +#endif + int prevCh = yyCh; + + yyPos0 = yyPos - 1; +#ifndef QT_NO_REGEXP_CCLASS + yyCharClass->clear(); +#endif + yyMinRep = 0; + yyMaxRep = 0; + yyCh = getChar(); + + switch (prevCh) { + case EOS: + yyPos0 = yyPos; + return Tok_Eos; + case '$': + return Tok_Dollar; + case '(': + if (yyCh == '?') { + prevCh = getChar(); + yyCh = getChar(); + switch (prevCh) { +#ifndef QT_NO_REGEXP_LOOKAHEAD + case '!': + return Tok_NegLookahead; + case '=': + return Tok_PosLookahead; +#endif + case ':': + return Tok_MagicLeftParen; + case '<': + error(RXERR_LOOKBEHIND); + return Tok_MagicLeftParen; + default: + error(RXERR_LOOKAHEAD); + return Tok_MagicLeftParen; + } + } else { + return Tok_LeftParen; + } + case ')': + return Tok_RightParen; + case '*': + yyMinRep = 0; + yyMaxRep = InftyRep; + return Tok_Quantifier; + case '+': + yyMinRep = 1; + yyMaxRep = InftyRep; + return Tok_Quantifier; + case '.': +#ifndef QT_NO_REGEXP_CCLASS + yyCharClass->setNegative(true); +#endif + return Tok_CharClass; + case '?': + yyMinRep = 0; + yyMaxRep = 1; + return Tok_Quantifier; + case '[': +#ifndef QT_NO_REGEXP_CCLASS + if (yyCh == '^') { + yyCharClass->setNegative(true); + yyCh = getChar(); + } + charPending = false; + rangePending = false; + do { + if (yyCh == '-' && charPending && !rangePending) { + rangePending = true; + yyCh = getChar(); + } else { + if (charPending && !rangePending) { + yyCharClass->addSingleton(pendingCh); + charPending = false; + } + if (yyCh == '\\') { + yyCh = getChar(); + tok = getEscape(); + if (tok == Tok_Word) + tok = '\b'; + } else { + tok = Tok_Char | yyCh; + yyCh = getChar(); + } + if (tok == Tok_CharClass) { + if (rangePending) { + yyCharClass->addSingleton('-'); + yyCharClass->addSingleton(pendingCh); + charPending = false; + rangePending = false; + } + } else if ((tok & Tok_Char) != 0) { + if (rangePending) { + yyCharClass->addRange(pendingCh, tok ^ Tok_Char); + charPending = false; + rangePending = false; + } else { + pendingCh = tok ^ Tok_Char; + charPending = true; + } + } else { + error(RXERR_CHARCLASS); + } + } + } while (yyCh != ']' && yyCh != EOS); + if (rangePending) + yyCharClass->addSingleton('-'); + if (charPending) + yyCharClass->addSingleton(pendingCh); + if (yyCh == EOS) + error(RXERR_END); + else + yyCh = getChar(); + return Tok_CharClass; +#else + error(RXERR_END); + return Tok_Char | '['; +#endif + case '\\': + return getEscape(); + case ']': + error(RXERR_LEFTDELIM); + return Tok_Char | ']'; + case '^': + return Tok_Caret; + case '{': +#ifndef QT_NO_REGEXP_INTERVAL + yyMinRep = getRep(0); + yyMaxRep = yyMinRep; + if (yyCh == ',') { + yyCh = getChar(); + yyMaxRep = getRep(InftyRep); + } + if (yyMaxRep < yyMinRep) + error(RXERR_INTERVAL); + if (yyCh != '}') + error(RXERR_REPETITION); + yyCh = getChar(); + return Tok_Quantifier; +#else + error(RXERR_DISABLED); + return Tok_Char | '{'; +#endif + case '|': + return Tok_Bar; + case '}': + error(RXERR_LEFTDELIM); + return Tok_Char | '}'; + default: + return Tok_Char | prevCh; + } +} + +int QRegExpEngine::parse(const QChar *pattern, int len) +{ + valid = true; + startTokenizer(pattern, len); + yyTok = getToken(); +#ifndef QT_NO_REGEXP_CAPTURE + yyMayCapture = true; +#else + yyMayCapture = false; +#endif + +#ifndef QT_NO_REGEXP_CAPTURE + int atom = startAtom(false); +#endif + QRegExpCharClass anything; + Box box(this); // create InitialState + box.set(anything); + Box rightBox(this); // create FinalState + rightBox.set(anything); + + Box middleBox(this); + parseExpression(&middleBox); +#ifndef QT_NO_REGEXP_CAPTURE + finishAtom(atom, false); +#endif +#ifndef QT_NO_REGEXP_OPTIM + middleBox.setupHeuristics(); +#endif + box.cat(middleBox); + box.cat(rightBox); + yyCharClass.reset(); + +#ifndef QT_NO_REGEXP_CAPTURE + for (int i = 0; i < nf; ++i) { + switch (f[i].capture) { + case QRegExpAtom::NoCapture: + break; + case QRegExpAtom::OfficialCapture: + f[i].capture = ncap; + captureForOfficialCapture.append(ncap); + ++ncap; + ++officialncap; + break; + case QRegExpAtom::UnofficialCapture: + f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture; + } + } + +#ifndef QT_NO_REGEXP_BACKREF +#ifndef QT_NO_REGEXP_OPTIM + if (officialncap == 0 && nbrefs == 0) { + ncap = nf = 0; + f.clear(); + } +#endif + // handle the case where there's a \5 with no corresponding capture + // (captureForOfficialCapture.size() != officialncap) + for (int i = 0; i < nbrefs - officialncap; ++i) { + captureForOfficialCapture.append(ncap); + ++ncap; + } +#endif +#endif + + if (!yyError.isEmpty()) + return -1; + +#ifndef QT_NO_REGEXP_OPTIM + const QRegExpAutomatonState &sinit = s.at(InitialState); + caretAnchored = !sinit.anchors.isEmpty(); + if (caretAnchored) { + const QMap &anchors = sinit.anchors; + QMap::const_iterator a; + for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) { + if ( +#ifndef QT_NO_REGEXP_ANCHOR_ALT + (*a & Anchor_Alternation) != 0 || +#endif + (*a & Anchor_Caret) == 0) + { + caretAnchored = false; + break; + } + } + } +#endif + + // cleanup anchors + int numStates = s.count(); + for (int i = 0; i < numStates; ++i) { + QRegExpAutomatonState &state = s[i]; + if (!state.anchors.isEmpty()) { + QMap::iterator a = state.anchors.begin(); + while (a != state.anchors.end()) { + if (a.value() == 0) + a = state.anchors.erase(a); + else + ++a; + } + } + } + + return yyPos0; +} + +void QRegExpEngine::parseAtom(Box *box) +{ +#ifndef QT_NO_REGEXP_LOOKAHEAD + QRegExpEngine *eng = nullptr; + bool neg; + int len; +#endif + + if ((yyTok & Tok_Char) != 0) { + box->set(QChar(yyTok ^ Tok_Char)); + } else { +#ifndef QT_NO_REGEXP_OPTIM + trivial = false; +#endif + switch (yyTok) { + case Tok_Dollar: + box->catAnchor(Anchor_Dollar); + break; + case Tok_Caret: + box->catAnchor(Anchor_Caret); + break; +#ifndef QT_NO_REGEXP_LOOKAHEAD + case Tok_PosLookahead: + case Tok_NegLookahead: + neg = (yyTok == Tok_NegLookahead); + eng = new QRegExpEngine(cs, greedyQuantifiers); + len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1); + if (len >= 0) + skipChars(len); + else + error(RXERR_LOOKAHEAD); + box->catAnchor(addLookahead(eng, neg)); + yyTok = getToken(); + if (yyTok != Tok_RightParen) + error(RXERR_LOOKAHEAD); + break; +#endif +#ifndef QT_NO_REGEXP_ESCAPE + case Tok_Word: + box->catAnchor(Anchor_Word); + break; + case Tok_NonWord: + box->catAnchor(Anchor_NonWord); + break; +#endif + case Tok_LeftParen: + case Tok_MagicLeftParen: + yyTok = getToken(); + parseExpression(box); + if (yyTok != Tok_RightParen) + error(RXERR_END); + break; + case Tok_CharClass: + box->set(*yyCharClass); + break; + case Tok_Quantifier: + error(RXERR_REPETITION); + break; + default: +#ifndef QT_NO_REGEXP_BACKREF + if ((yyTok & Tok_BackRef) != 0) + box->set(yyTok ^ Tok_BackRef); + else +#endif + error(RXERR_DISABLED); + } + } + yyTok = getToken(); +} + +void QRegExpEngine::parseFactor(Box *box) +{ +#ifndef QT_NO_REGEXP_CAPTURE + int outerAtom = greedyQuantifiers ? startAtom(false) : -1; + int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen); + bool magicLeftParen = (yyTok == Tok_MagicLeftParen); +#else + const int innerAtom = -1; +#endif + +#ifndef QT_NO_REGEXP_INTERVAL +#define YYREDO() \ + yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ + *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok + + const QChar *in = yyIn; + int pos0 = yyPos0; + int pos = yyPos; + int len = yyLen; + int ch = yyCh; + QRegExpCharClass charClass; + if (yyTok == Tok_CharClass) + charClass = *yyCharClass; + int tok = yyTok; + bool mayCapture = yyMayCapture; +#endif + + parseAtom(box); +#ifndef QT_NO_REGEXP_CAPTURE + finishAtom(innerAtom, magicLeftParen); +#endif + + bool hasQuantifier = (yyTok == Tok_Quantifier); + if (hasQuantifier) { +#ifndef QT_NO_REGEXP_OPTIM + trivial = false; +#endif + if (yyMaxRep == InftyRep) { + box->plus(innerAtom); +#ifndef QT_NO_REGEXP_INTERVAL + } else if (yyMaxRep == 0) { + box->clear(); +#endif + } + if (yyMinRep == 0) + box->opt(); + +#ifndef QT_NO_REGEXP_INTERVAL + yyMayCapture = false; + int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1; + int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1); + + Box rightBox(this); + int i; + + for (i = 0; i < beta; i++) { + YYREDO(); + Box leftBox(this); + parseAtom(&leftBox); + leftBox.cat(rightBox); + leftBox.opt(); + rightBox = leftBox; + } + for (i = 0; i < alpha; i++) { + YYREDO(); + Box leftBox(this); + parseAtom(&leftBox); + leftBox.cat(rightBox); + rightBox = leftBox; + } + rightBox.cat(*box); + *box = rightBox; +#endif + yyTok = getToken(); +#ifndef QT_NO_REGEXP_INTERVAL + yyMayCapture = mayCapture; +#endif + } +#undef YYREDO +#ifndef QT_NO_REGEXP_CAPTURE + if (greedyQuantifiers) + finishAtom(outerAtom, hasQuantifier); +#endif +} + +void QRegExpEngine::parseTerm(Box *box) +{ +#ifndef QT_NO_REGEXP_OPTIM + if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) + parseFactor(box); +#endif + while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) { + Box rightBox(this); + parseFactor(&rightBox); + box->cat(rightBox); + } +} + +void QRegExpEngine::parseExpression(Box *box) +{ + parseTerm(box); + while (yyTok == Tok_Bar) { +#ifndef QT_NO_REGEXP_OPTIM + trivial = false; +#endif + Box rightBox(this); + yyTok = getToken(); + parseTerm(&rightBox); + box->orx(rightBox); + } +} + +/* + The struct QRegExpPrivate contains the private data of a regular + expression other than the automaton. It makes it possible for many + QRegExp objects to use the same QRegExpEngine object with different + QRegExpPrivate objects. +*/ +struct QRegExpPrivate +{ + QRegExpEngine *eng; + QRegExpEngineKey engineKey; + bool minimal; +#ifndef QT_NO_REGEXP_CAPTURE + QString t; // last string passed to QRegExp::indexIn() or lastIndexIn() + QStringList capturedCache; // what QRegExp::capturedTexts() returned last +#endif + QRegExpMatchState matchState; + + inline QRegExpPrivate() + : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { } + inline QRegExpPrivate(const QRegExpEngineKey &key) + : eng(nullptr), engineKey(key), minimal(false) {} +}; + +#if !defined(QT_NO_REGEXP_OPTIM) +struct QRECache +{ + typedef QHash EngineCache; + typedef QCache UnusedEngineCache; + EngineCache usedEngines; + UnusedEngineCache unusedEngines; +}; +Q_GLOBAL_STATIC(QRECache, engineCache) +static QBasicMutex engineCacheMutex; +#endif // QT_NO_REGEXP_OPTIM + +static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key) +{ +#if !defined(QT_NO_REGEXP_OPTIM) + QMutexLocker locker(&engineCacheMutex); + if (!eng->ref.deref()) { + if (QRECache *c = engineCache()) { + c->unusedEngines.insert(key, eng, 4 + key.pattern.length() / 4); + c->usedEngines.remove(key); + } else { + delete eng; + } + } +#else + Q_UNUSED(key); + if (!eng->ref.deref()) + delete eng; +#endif +} + +static void prepareEngine_helper(QRegExpPrivate *priv) +{ + Q_ASSERT(!priv->eng); + +#if !defined(QT_NO_REGEXP_OPTIM) + QMutexLocker locker(&engineCacheMutex); + if (QRECache *c = engineCache()) { + priv->eng = c->unusedEngines.take(priv->engineKey); + if (!priv->eng) + priv->eng = c->usedEngines.value(priv->engineKey); + if (!priv->eng) + priv->eng = new QRegExpEngine(priv->engineKey); + else + priv->eng->ref.ref(); + + c->usedEngines.insert(priv->engineKey, priv->eng); + return; + } +#endif // QT_NO_REGEXP_OPTIM + + priv->eng = new QRegExpEngine(priv->engineKey); +} + +inline static void prepareEngine(QRegExpPrivate *priv) +{ + if (priv->eng) + return; + prepareEngine_helper(priv); + priv->matchState.prepareForMatch(priv->eng); +} + +static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str) +{ + prepareEngine(priv); + priv->matchState.prepareForMatch(priv->eng); +#ifndef QT_NO_REGEXP_CAPTURE + priv->t = str; + priv->capturedCache.clear(); +#else + Q_UNUSED(str); +#endif +} + +static void invalidateEngine(QRegExpPrivate *priv) +{ + if (priv->eng) { + derefEngine(priv->eng, priv->engineKey); + priv->eng = nullptr; + priv->matchState.drain(); + } +} + +/*! + \enum QRegExp::CaretMode + + The CaretMode enum defines the different meanings of the caret + (\b{^}) in a regular expression. The possible values are: + + \value CaretAtZero + The caret corresponds to index 0 in the searched string. + + \value CaretAtOffset + The caret corresponds to the start offset of the search. + + \value CaretWontMatch + The caret never matches. +*/ + +/*! + \enum QRegExp::PatternSyntax + + The syntax used to interpret the meaning of the pattern. + + \value RegExp A rich Perl-like pattern matching syntax. This is + the default. + + \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. + (Introduced in Qt 4.2.) + + \value Wildcard This provides a simple pattern matching syntax + similar to that used by shells (command interpreters) for "file + globbing". See \l{QRegExp wildcard matching}. + + \value WildcardUnix This is similar to Wildcard but with the + behavior of a Unix shell. The wildcard characters can be escaped + with the character "\\". + + \value FixedString The pattern is a fixed string. This is + equivalent to using the RegExp pattern on a string in + which all metacharacters are escaped using escape(). + + \value W3CXmlSchema11 The pattern is a regular expression as + defined by the W3C XML Schema 1.1 specification. + + \sa setPatternSyntax() +*/ + +/*! + Constructs an empty regexp. + + \sa isValid(), errorString() +*/ +QRegExp::QRegExp() +{ + priv = new QRegExpPrivate; + prepareEngine(priv); +} + +/*! + Constructs a regular expression object for the given \a pattern + string. The pattern must be given using wildcard notation if \a + syntax is \l Wildcard; the default is \l RegExp. The pattern is + case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is + greedy (maximal), but can be changed by calling + setMinimal(). + + \sa setPattern(), setCaseSensitivity(), setPatternSyntax() +*/ +QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax) +{ + priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)); + prepareEngine(priv); +} + +/*! + Constructs a regular expression as a copy of \a rx. + + \sa operator=() +*/ +QRegExp::QRegExp(const QRegExp &rx) +{ + priv = new QRegExpPrivate; + operator=(rx); +} + +/*! + Destroys the regular expression and cleans up its internal data. +*/ +QRegExp::~QRegExp() +{ + invalidateEngine(priv); + delete priv; +} + +/*! + Copies the regular expression \a rx and returns a reference to the + copy. The case sensitivity, wildcard, and minimal matching options + are also copied. +*/ +QRegExp &QRegExp::operator=(const QRegExp &rx) +{ + prepareEngine(rx.priv); // to allow sharing + QRegExpEngine *otherEng = rx.priv->eng; + if (otherEng) + otherEng->ref.ref(); + invalidateEngine(priv); + priv->eng = otherEng; + priv->engineKey = rx.priv->engineKey; + priv->minimal = rx.priv->minimal; +#ifndef QT_NO_REGEXP_CAPTURE + priv->t = rx.priv->t; + priv->capturedCache = rx.priv->capturedCache; +#endif + if (priv->eng) + priv->matchState.prepareForMatch(priv->eng); + priv->matchState.captured = rx.priv->matchState.captured; + return *this; +} + +/*! + \fn QRegExp &QRegExp::operator=(QRegExp &&other) + + Move-assigns \a other to this QRegExp instance. + + \since 5.2 +*/ + +/*! + \fn void QRegExp::swap(QRegExp &other) + \since 4.8 + + Swaps regular expression \a other with this regular + expression. This operation is very fast and never fails. +*/ + +/*! + Returns \c true if this regular expression is equal to \a rx; + otherwise returns \c false. + + Two QRegExp objects are equal if they have the same pattern + strings and the same settings for case sensitivity, wildcard and + minimal matching. +*/ +bool QRegExp::operator==(const QRegExp &rx) const +{ + return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal; +} + +/*! + \since 5.6 + \relates QRegExp + + Returns the hash value for \a key, using + \a seed to seed the calculation. +*/ +uint qHash(const QRegExp &key, uint seed) noexcept +{ + QtPrivate::QHashCombine hash; + seed = hash(seed, key.priv->engineKey); + seed = hash(seed, key.priv->minimal); + return seed; +} + +/*! + \fn bool QRegExp::operator!=(const QRegExp &rx) const + + Returns \c true if this regular expression is not equal to \a rx; + otherwise returns \c false. + + \sa operator==() +*/ + +/*! + Returns \c true if the pattern string is empty; otherwise returns + false. + + If you call exactMatch() with an empty pattern on an empty string + it will return true; otherwise it returns \c false since it operates + over the whole string. If you call indexIn() with an empty pattern + on \e any string it will return the start offset (0 by default) + because the empty pattern matches the 'emptiness' at the start of + the string. In this case the length of the match returned by + matchedLength() will be 0. + + See QString::isEmpty(). +*/ + +bool QRegExp::isEmpty() const +{ + return priv->engineKey.pattern.isEmpty(); +} + +/*! + Returns \c true if the regular expression is valid; otherwise returns + false. An invalid regular expression never matches. + + The pattern \b{[a-z} is an example of an invalid pattern, since + it lacks a closing square bracket. + + Note that the validity of a regexp may also depend on the setting + of the wildcard flag, for example \b{*.html} is a valid + wildcard regexp but an invalid full regexp. + + \sa errorString() +*/ +bool QRegExp::isValid() const +{ + if (priv->engineKey.pattern.isEmpty()) { + return true; + } else { + prepareEngine(priv); + return priv->eng->isValid(); + } +} + +/*! + Returns the pattern string of the regular expression. The pattern + has either regular expression syntax or wildcard syntax, depending + on patternSyntax(). + + \sa patternSyntax(), caseSensitivity() +*/ +QString QRegExp::pattern() const +{ + return priv->engineKey.pattern; +} + +/*! + Sets the pattern string to \a pattern. The case sensitivity, + wildcard, and minimal matching options are not changed. + + \sa setPatternSyntax(), setCaseSensitivity() +*/ +void QRegExp::setPattern(const QString &pattern) +{ + if (priv->engineKey.pattern != pattern) { + invalidateEngine(priv); + priv->engineKey.pattern = pattern; + } +} + +/*! + Returns Qt::CaseSensitive if the regexp is matched case + sensitively; otherwise returns Qt::CaseInsensitive. + + \sa patternSyntax(), pattern(), isMinimal() +*/ +Qt::CaseSensitivity QRegExp::caseSensitivity() const +{ + return priv->engineKey.cs; +} + +/*! + Sets case sensitive matching to \a cs. + + If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches + \c{readme.txt} but not \c{README.TXT}. + + \sa setPatternSyntax(), setPattern(), setMinimal() +*/ +void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs) +{ + if ((bool)cs != (bool)priv->engineKey.cs) { + invalidateEngine(priv); + priv->engineKey.cs = cs; + } +} + +/*! + Returns the syntax used by the regular expression. The default is + QRegExp::RegExp. + + \sa pattern(), caseSensitivity() +*/ +QRegExp::PatternSyntax QRegExp::patternSyntax() const +{ + return priv->engineKey.patternSyntax; +} + +/*! + Sets the syntax mode for the regular expression. The default is + QRegExp::RegExp. + + Setting \a syntax to QRegExp::Wildcard enables simple shell-like + \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the + string \c{readme.txt} in wildcard mode, but does not match + \c{readme}. + + Setting \a syntax to QRegExp::FixedString means that the pattern + is interpreted as a plain string. Special characters (e.g., + backslash) don't need to be escaped then. + + \sa setPattern(), setCaseSensitivity(), escape() +*/ +void QRegExp::setPatternSyntax(PatternSyntax syntax) +{ + if (syntax != priv->engineKey.patternSyntax) { + invalidateEngine(priv); + priv->engineKey.patternSyntax = syntax; + } +} + +/*! + Returns \c true if minimal (non-greedy) matching is enabled; + otherwise returns \c false. + + \sa caseSensitivity(), setMinimal() +*/ +bool QRegExp::isMinimal() const +{ + return priv->minimal; +} + +/*! + Enables or disables minimal matching. If \a minimal is false, + matching is greedy (maximal) which is the default. + + For example, suppose we have the input string "We must be + bold, very bold!" and the pattern + \b{.*}. With the default greedy (maximal) matching, + the match is "We must be \underline{bold, very + bold}!". But with minimal (non-greedy) matching, the + first match is: "We must be \underline{bold}, very + bold!" and the second match is "We must be bold, + very \underline{bold}!". In practice we might use the pattern + \b{[^<]*\} instead, although this will still fail for + nested tags. + + \sa setCaseSensitivity() +*/ +void QRegExp::setMinimal(bool minimal) +{ + priv->minimal = minimal; +} + +// ### Qt 5: make non-const +/*! + Returns \c true if \a str is matched exactly by this regular + expression; otherwise returns \c false. You can determine how much of + the string was matched by calling matchedLength(). + + For a given regexp string R, exactMatch("R") is the equivalent of + indexIn("^R$") since exactMatch() effectively encloses the regexp + in the start of string and end of string anchors, except that it + sets matchedLength() differently. + + For example, if the regular expression is \b{blue}, then + exactMatch() returns \c true only for input \c blue. For inputs \c + bluebell, \c blutak and \c lightblue, exactMatch() returns \c false + and matchedLength() will return 4, 3 and 0 respectively. + + Although const, this function sets matchedLength(), + capturedTexts(), and pos(). + + \sa indexIn(), lastIndexIn() +*/ +bool QRegExp::exactMatch(const QString &str) const +{ + prepareEngineForMatch(priv, str); + priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0); + if (priv->matchState.captured[1] == str.length()) { + return true; + } else { + priv->matchState.captured[0] = 0; + priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen; + return false; + } +} + +// ### Qt 5: make non-const +/*! + Attempts to find a match in \a str from position \a offset (0 by + default). If \a offset is -1, the search starts at the last + character; if -2, at the next to last character; etc. + + Returns the position of the first match, or -1 if there was no + match. + + The \a caretMode parameter can be used to instruct whether \b{^} + should match at index 0 or at \a offset. + + You might prefer to use QString::indexOf(), QString::contains(), + or even QStringList::filter(). To replace matches use + QString::replace(). + + Example: + \snippet code/src_corelib_tools_qregexp.cpp 13 + + Although const, this function sets matchedLength(), + capturedTexts() and pos(). + + If the QRegExp is a wildcard expression (see setPatternSyntax()) + and want to test a string against the whole wildcard expression, + use exactMatch() instead of this function. + + \sa lastIndexIn(), exactMatch() +*/ + +int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const +{ + prepareEngineForMatch(priv, str); + if (offset < 0) + offset += str.length(); + priv->matchState.match(str.unicode(), str.length(), offset, + priv->minimal, false, caretIndex(offset, caretMode)); + return priv->matchState.captured[0]; +} + +// ### Qt 5: make non-const +/*! + Attempts to find a match backwards in \a str from position \a + offset. If \a offset is -1 (the default), the search starts at the + last character; if -2, at the next to last character; etc. + + Returns the position of the first match, or -1 if there was no + match. + + The \a caretMode parameter can be used to instruct whether \b{^} + should match at index 0 or at \a offset. + + Although const, this function sets matchedLength(), + capturedTexts() and pos(). + + \warning Searching backwards is much slower than searching + forwards. + + \sa indexIn(), exactMatch() +*/ + +int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const +{ + prepareEngineForMatch(priv, str); + if (offset < 0) + offset += str.length(); + if (offset < 0 || offset > str.length()) { + memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int)); + return -1; + } + + while (offset >= 0) { + priv->matchState.match(str.unicode(), str.length(), offset, + priv->minimal, true, caretIndex(offset, caretMode)); + if (priv->matchState.captured[0] == offset) + return offset; + --offset; + } + return -1; +} + +/*! + Returns the length of the last matched string, or -1 if there was + no match. + + \sa exactMatch(), indexIn(), lastIndexIn() +*/ +int QRegExp::matchedLength() const +{ + return priv->matchState.captured[1]; +} + +#ifndef QT_NO_REGEXP_CAPTURE + +/*! + \since 4.6 + Returns the number of captures contained in the regular expression. + */ +int QRegExp::captureCount() const +{ + prepareEngine(priv); + return priv->eng->captureCount(); +} + +/*! + Returns a list of the captured text strings. + + The first string in the list is the entire matched string. Each + subsequent list element contains a string that matched a + (capturing) subexpression of the regexp. + + For example: + \snippet code/src_corelib_tools_qregexp.cpp 14 + + The above example also captures elements that may be present but + which we have no interest in. This problem can be solved by using + non-capturing parentheses: + + \snippet code/src_corelib_tools_qregexp.cpp 15 + + Note that if you want to iterate over the list, you should iterate + over a copy, e.g. + \snippet code/src_corelib_tools_qregexp.cpp 16 + + Some regexps can match an indeterminate number of times. For + example if the input string is "Offsets: 12 14 99 231 7" and the + regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of + all the numbers matched. However, after calling + \c{rx.indexIn(str)}, capturedTexts() will return the list ("12", + "12"), i.e. the entire match was "12" and the first subexpression + matched was "12". The correct approach is to use cap() in a + \l{QRegExp#cap_in_a_loop}{loop}. + + The order of elements in the string list is as follows. The first + element is the entire matching string. Each subsequent element + corresponds to the next capturing open left parentheses. Thus + capturedTexts()[1] is the text of the first capturing parentheses, + capturedTexts()[2] is the text of the second and so on + (corresponding to $1, $2, etc., in some other regexp languages). + + \sa cap(), pos() +*/ +QStringList QRegExp::capturedTexts() const +{ + if (priv->capturedCache.isEmpty()) { + prepareEngine(priv); + const int *captured = priv->matchState.captured; + int n = priv->matchState.capturedSize; + + for (int i = 0; i < n; i += 2) { + QString m; + if (captured[i + 1] == 0) + m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty + else if (captured[i] >= 0) + m = priv->t.mid(captured[i], captured[i + 1]); + priv->capturedCache.append(m); + } + priv->t.clear(); + } + return priv->capturedCache; +} + +/*! + \internal +*/ +QStringList QRegExp::capturedTexts() +{ + return const_cast(this)->capturedTexts(); +} + +/*! + Returns the text captured by the \a nth subexpression. The entire + match has index 0 and the parenthesized subexpressions have + indexes starting from 1 (excluding non-capturing parentheses). + + \snippet code/src_corelib_tools_qregexp.cpp 17 + + The order of elements matched by cap() is as follows. The first + element, cap(0), is the entire matching string. Each subsequent + element corresponds to the next capturing open left parentheses. + Thus cap(1) is the text of the first capturing parentheses, cap(2) + is the text of the second, and so on. + + \sa capturedTexts(), pos() +*/ +QString QRegExp::cap(int nth) const +{ + return capturedTexts().value(nth); +} + +/*! + \internal +*/ +QString QRegExp::cap(int nth) +{ + return const_cast(this)->cap(nth); +} + +/*! + Returns the position of the \a nth captured text in the searched + string. If \a nth is 0 (the default), pos() returns the position + of the whole match. + + Example: + \snippet code/src_corelib_tools_qregexp.cpp 18 + + For zero-length matches, pos() always returns -1. (For example, if + cap(4) would return an empty string, pos(4) returns -1.) This is + a feature of the implementation. + + \sa cap(), capturedTexts() +*/ +int QRegExp::pos(int nth) const +{ + if (nth < 0 || nth >= priv->matchState.capturedSize / 2) + return -1; + else + return priv->matchState.captured[2 * nth]; +} + +/*! + \internal +*/ +int QRegExp::pos(int nth) +{ + return const_cast(this)->pos(nth); +} + +/*! + Returns a text string that explains why a regexp pattern is + invalid the case being; otherwise returns "no error occurred". + + \sa isValid() +*/ +QString QRegExp::errorString() const +{ + if (isValid()) { + return QString::fromLatin1(RXERR_OK); + } else { + return priv->eng->errorString(); + } +} + +/*! + \internal +*/ +QString QRegExp::errorString() +{ + return const_cast(this)->errorString(); +} +#endif + +/*! + Returns the string \a str with every regexp special character + escaped with a backslash. The special characters are $, (,), *, +, + ., ?, [, \,], ^, {, | and }. + + Example: + + \snippet code/src_corelib_tools_qregexp.cpp 19 + + This function is useful to construct regexp patterns dynamically: + + \snippet code/src_corelib_tools_qregexp.cpp 20 + + \sa setPatternSyntax() +*/ +QString QRegExp::escape(const QString &str) +{ + QString quoted; + const int count = str.count(); + quoted.reserve(count * 2); + const QLatin1Char backslash('\\'); + for (int i = 0; i < count; i++) { + switch (str.at(i).toLatin1()) { + case '$': + case '(': + case ')': + case '*': + case '+': + case '.': + case '?': + case '[': + case '\\': + case ']': + case '^': + case '{': + case '|': + case '}': + quoted.append(backslash); + } + quoted.append(str.at(i)); + } + return quoted; +} + + +#ifndef QT_NO_DATASTREAM +/*! + \relates QRegExp + + Writes the regular expression \a regExp to stream \a out. + + \sa {Serializing Qt Data Types} +*/ +QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp) +{ + return out << regExp.pattern() << (quint8)regExp.caseSensitivity() + << (quint8)regExp.patternSyntax() + << (quint8)!!regExp.isMinimal(); +} + +/*! + \relates QRegExp + + Reads a regular expression from stream \a in into \a regExp. + + \sa {Serializing Qt Data Types} +*/ +QDataStream &operator>>(QDataStream &in, QRegExp ®Exp) +{ + QString pattern; + quint8 cs; + quint8 patternSyntax; + quint8 isMinimal; + + in >> pattern >> cs >> patternSyntax >> isMinimal; + + QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs), + QRegExp::PatternSyntax(patternSyntax)); + + newRegExp.setMinimal(isMinimal); + regExp = newRegExp; + return in; +} +#endif // QT_NO_DATASTREAM + +#ifndef QT_NO_DEBUG_STREAM +QDebug operator<<(QDebug dbg, const QRegExp &r) +{ + QDebugStateSaver saver(dbg); + dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax() + << ", pattern='"<< r.pattern() << "')"; + return dbg; +} +#endif + +QT_END_NAMESPACE -- cgit v1.2.3