diff options
author | Sona Kurazyan <sona.kurazyan@qt.io> | 2020-06-29 16:54:15 +0200 |
---|---|---|
committer | Sona Kurazyan <sona.kurazyan@qt.io> | 2020-07-13 10:53:23 +0200 |
commit | 361dc074f2301b4b68435c05ccaa7279c0170776 (patch) | |
tree | 5b22e926cae55437c46aa9507d8dd36bb252ebb0 /src | |
parent | ac14858e85cfee06c1e19843b92d50e38bc969dd (diff) |
Move QRegExp and its remaining mentions out of QtCore
Task-number: QTBUG-85235
Change-Id: Ibd6c98d952c1bb9916b64715c6430fb0d3fe3843
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Diffstat (limited to 'src')
-rw-r--r-- | src/corelib/.prev_CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/corelib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/corelib/doc/snippets/code/src_corelib_text_qregexp.cpp | 243 | ||||
-rw-r--r-- | src/corelib/text/qregexp.cpp | 5039 | ||||
-rw-r--r-- | src/corelib/text/qregexp.h | 151 | ||||
-rw-r--r-- | src/corelib/text/text.pri | 2 | ||||
-rw-r--r-- | src/tools/uic/qclass_lib_map.h | 1 |
7 files changed, 0 insertions, 5438 deletions
diff --git a/src/corelib/.prev_CMakeLists.txt b/src/corelib/.prev_CMakeLists.txt index f93c934c85..23b44f739d 100644 --- a/src/corelib/.prev_CMakeLists.txt +++ b/src/corelib/.prev_CMakeLists.txt @@ -145,7 +145,6 @@ qt_add_module(Core text/qlocale.cpp text/qlocale.h text/qlocale_p.h text/qlocale_data_p.h text/qlocale_tools.cpp text/qlocale_tools_p.h - text/qregexp.cpp text/qregexp.h text/qstring.cpp text/qstring.h text/qstring_compat.cpp text/qstringalgorithms.h text/qstringalgorithms_p.h diff --git a/src/corelib/CMakeLists.txt b/src/corelib/CMakeLists.txt index 9deda274b7..84d1a79b9b 100644 --- a/src/corelib/CMakeLists.txt +++ b/src/corelib/CMakeLists.txt @@ -164,7 +164,6 @@ qt_add_module(Core text/qlocale.cpp text/qlocale.h text/qlocale_p.h text/qlocale_data_p.h text/qlocale_tools.cpp text/qlocale_tools_p.h - text/qregexp.cpp text/qregexp.h text/qstring.cpp text/qstring.h text/qstring_compat.cpp text/qstringalgorithms.h text/qstringalgorithms_p.h diff --git a/src/corelib/doc/snippets/code/src_corelib_text_qregexp.cpp b/src/corelib/doc/snippets/code/src_corelib_text_qregexp.cpp deleted file mode 100644 index 8339ea413e..0000000000 --- a/src/corelib/doc/snippets/code/src_corelib_text_qregexp.cpp +++ /dev/null @@ -1,243 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2016 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the documentation of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:BSD$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. 
-** -** BSD License Usage -** Alternatively, you may use this file under the terms of the BSD license -** as follows: -** -** "Redistribution and use in source and binary forms, with or without -** modification, are permitted provided that the following conditions are -** met: -** * Redistributions of source code must retain the above copyright -** notice, this list of conditions and the following disclaimer. -** * Redistributions in binary form must reproduce the above copyright -** notice, this list of conditions and the following disclaimer in -** the documentation and/or other materials provided with the -** distribution. -** * Neither the name of The Qt Company Ltd nor the names of its -** contributors may be used to endorse or promote products derived -** from this software without specific prior written permission. -** -** -** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -** OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -** LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -** OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -//! 
[0] -QRegExp rx("(\\d+)"); -QString str = "Offsets: 12 14 99 231 7"; -QStringList list; -int pos = 0; - -while ((pos = rx.indexIn(str, pos)) != -1) { - list << rx.cap(1); - pos += rx.matchedLength(); -} -// list: ["12", "14", "99", "231", "7"] -//! [0] - - -//! [1] -QRegExp rx("*.txt"); -rx.setPatternSyntax(QRegExp::Wildcard); -rx.exactMatch("README.txt"); // returns true -rx.exactMatch("welcome.txt.bak"); // returns false -//! [1] - - -//! [2] -QRegExp rx("ro+m"); -rx.setMinimal(true); -//! [2] - - -//! [3] -QRegExp mark("\\b" // word boundary - "[Mm]ark" // the word we want to match - ); -//! [3] - - -//! [4] -QRegExp rx("^\\d\\d?$"); // match integers 0 to 99 -rx.indexIn("123"); // returns -1 (no match) -rx.indexIn("-6"); // returns -1 (no match) -rx.indexIn("6"); // returns 0 (matched at position 0) -//! [4] - - -//! [5] -QRegExp rx("^\\S+$"); // match strings without whitespace -rx.indexIn("Hello world"); // returns -1 (no match) -rx.indexIn("This_is-OK"); // returns 0 (matched at position 0) -//! [5] - - -//! [6] -QRegExp rx("\\b(mail|letter|correspondence)\\b"); -rx.indexIn("I sent you an email"); // returns -1 (no match) -rx.indexIn("Please write the letter"); // returns 17 -//! [6] - - -//! [7] -QString captured = rx.cap(1); // captured == "letter" -//! [7] - - -//! [8] -QRegExp rx("&(?!amp;)"); // match ampersands but not &amp; -QString line1 = "This & that"; -line1.replace(rx, "&amp;"); -// line1 == "This &amp; that" -QString line2 = "His &amp; hers & theirs"; -line2.replace(rx, "&amp;"); -// line2 == "His &amp; hers &amp; theirs" -//! [8] - - -//! [9] -QString str = "One Eric another Eirik, and an Ericsson. " - "How many Eiriks, Eric?"; -QRegExp rx("\\b(Eric|Eirik)\\b"); // match Eric or Eirik -int pos = 0; // where we are in the string -int count = 0; // how many Eric and Eirik's we've counted -while (pos >= 0) { - pos = rx.indexIn(str, pos); - if (pos >= 0) { - ++pos; // move along in str - ++count; // count our Eric or Eirik - } -} -//! [9] - - -//! 
[10] -str = "The Qt Company Ltd\tqt.io\tFinland"; -QString company, web, country; -rx.setPattern("^([^\t]+)\t([^\t]+)\t([^\t]+)$"); -if (rx.indexIn(str) != -1) { - company = rx.cap(1); - web = rx.cap(2); - country = rx.cap(3); -} -//! [10] - - -//! [11] -QStringList field = str.split("\t"); -//! [11] - - -//! [12] -QRegExp rx("*.html"); -rx.setPatternSyntax(QRegExp::Wildcard); -rx.exactMatch("index.html"); // returns true -rx.exactMatch("default.htm"); // returns false -rx.exactMatch("readme.txt"); // returns false -//! [12] - - -//! [13] -QString str = "offsets: 1.23 .50 71.00 6.00"; -QRegExp rx("\\d*\\.\\d+"); // primitive floating point matching -int count = 0; -int pos = 0; -while ((pos = rx.indexIn(str, pos)) != -1) { - ++count; - pos += rx.matchedLength(); -} -// pos will be 9, 14, 18 and finally 24; count will end up as 4 -//! [13] - - -//! [14] -QRegExp rx("(\\d+)(\\s*)(cm|inch(es)?)"); -int pos = rx.indexIn("Length: 36 inches"); -QStringList list = rx.capturedTexts(); -// list is now ("36 inches", "36", " ", "inches", "es") -//! [14] - - -//! [15] -QRegExp rx("(\\d+)(?:\\s*)(cm|inch(?:es)?)"); -int pos = rx.indexIn("Length: 36 inches"); -QStringList list = rx.capturedTexts(); -// list is now ("36 inches", "36", "inches") -//! [15] - - -//! [16] -QStringList list = rx.capturedTexts(); -QStringList::iterator it = list.begin(); -while (it != list.end()) { - myProcessing(*it); - ++it; -} -//! [16] - - -//! [17] -QRegExp rxlen("(\\d+)(?:\\s*)(cm|inch)"); -int pos = rxlen.indexIn("Length: 189cm"); -if (pos > -1) { - QString value = rxlen.cap(1); // "189" - QString unit = rxlen.cap(2); // "cm" - // ... -} -//! [17] - - -//! [18] -QRegExp rx("/([a-z]+)/([a-z]+)"); -rx.indexIn("Output /dev/null"); // returns 7 (position of /dev/null) -rx.pos(0); // returns 7 (position of /dev/null) -rx.pos(1); // returns 8 (position of dev) -rx.pos(2); // returns 12 (position of null) -//! [18] - - -//! 
[19] -s1 = QRegExp::escape("bingo"); // s1 == "bingo" -s2 = QRegExp::escape("f(x)"); // s2 == "f\\(x\\)" -//! [19] - - -//! [20] -QRegExp rx("(" + QRegExp::escape(name) + - "|" + QRegExp::escape(alias) + ")"); -//! [20] - -{ -//! [21] -QString p("a .*|pattern"); - -// re matches exactly the pattern string p -QRegularExpression re(QRegularExpression::anchoredPattern(p)); -//! [21] -} diff --git a/src/corelib/text/qregexp.cpp b/src/corelib/text/qregexp.cpp deleted file mode 100644 index d7a2434b52..0000000000 --- a/src/corelib/text/qregexp.cpp +++ /dev/null @@ -1,5039 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2016 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. 
-** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -#include "qregexp.h" - -#include "qalgorithms.h" -#include "qbitarray.h" -#include "qcache.h" -#include "qdatastream.h" -#include "qdebug.h" -#include "qhashfunctions.h" -#include "qlist.h" -#include "qmap.h" -#include "qmutex.h" -#include "qstring.h" -#include "qstringlist.h" -#include "qstringmatcher.h" -#include "private/qlocking_p.h" - -#include <limits.h> -#include <algorithm> - -QT_BEGIN_NAMESPACE - -// error strings for the regexp parser -#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred") -#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used") -#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax") -#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax") -#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371") -#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax") -#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value") -#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim") -#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end") -#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit") -#define 
RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval") -#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category") - -/*! - \class QRegExp - \inmodule QtCore - \obsolete Use QRegularExpression instead - \reentrant - \brief The QRegExp class provides pattern matching using regular expressions. - - \ingroup tools - \ingroup shared - - \keyword regular expression - - This class is deprecated in Qt 6. Please use QRegularExpression instead - for all new code. For guidelines on porting old code from QRegExp to - QRegularExpression, see {Porting to QRegularExpression} - - A regular expression, or "regexp", is a pattern for matching - substrings in a text. This is useful in many contexts, e.g., - - \table - \row \li Validation - \li A regexp can test whether a substring meets some criteria, - e.g. is an integer or contains no whitespace. - \row \li Searching - \li A regexp provides more powerful pattern matching than - simple substring matching, e.g., match one of the words - \e{mail}, \e{letter} or \e{correspondence}, but none of the - words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. - \row \li Search and Replace - \li A regexp can replace all occurrences of a substring with a - different substring, e.g., replace all occurrences of \e{&} - with \e{\&amp;} except where the \e{&} is already followed by - an \e{amp;}. - \row \li String Splitting - \li A regexp can be used to identify where a string should be - split apart, e.g. splitting tab-delimited strings. - \endtable - - A brief introduction to regexps is presented, a description of - Qt's regexp language, some examples, and the function - documentation itself. QRegExp is modeled on Perl's regexp - language. It fully supports Unicode. QRegExp can also be used in a - simpler, \e{wildcard mode} that is similar to the functionality - found in command shells. The syntax rules used by QRegExp can be - changed with setPatternSyntax(). 
In particular, the pattern syntax - can be set to QRegExp::FixedString, which means the pattern to be - matched is interpreted as a plain string, i.e., special characters - (e.g., backslash) are not escaped. - - A good text on regexps is \e {Mastering Regular Expressions} - (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4. - - \note In Qt 5, the new QRegularExpression class provides a Perl - compatible implementation of regular expressions and is recommended - in place of QRegExp. - - \tableofcontents - - \section1 Introduction - - Regexps are built up from expressions, quantifiers, and - assertions. The simplest expression is a character, e.g. \b{x} - or \b{5}. An expression can also be a set of characters - enclosed in square brackets. \b{[ABCD]} will match an \b{A} - or a \b{B} or a \b{C} or a \b{D}. We can write this same - expression as \b{[A-D]}, and an expression to match any - capital letter in the English alphabet is written as - \b{[A-Z]}. - - A quantifier specifies the number of occurrences of an expression - that must be matched. \b{x{1,1}} means match one and only one - \b{x}. \b{x{1,5}} means match a sequence of \b{x} - characters that contains at least one \b{x} but no more than - five. - - Note that in general regexps cannot be used to check for balanced - brackets or tags. For example, a regexp can be written to match an - opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags - are not nested, but if the \c{<b>} tags are nested, that same - regexp will match an opening \c{<b>} tag with the wrong closing - \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the - first \c{<b>} would be matched with the first \c{</b>}, which is - not correct. However, it is possible to write a regexp that will - match nested brackets or tags correctly, but only if the number of - nesting levels is fixed and known. If the number of nesting levels - is not fixed and known, it is impossible to write a regexp that - will not fail. 
- - Suppose we want a regexp to match integers in the range 0 to 99. - At least one digit is required, so we start with the expression - \b{[0-9]{1,1}}, which matches a single digit exactly once. This - regexp matches integers in the range 0 to 9. To match integers up - to 99, increase the maximum number of occurrences to 2, so the - regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the - original requirement to match integers from 0 to 99, but it will - also match integers that occur in the middle of strings. If we - want the matched integer to be the whole string, we must use the - anchor assertions, \b{^} (caret) and \b{$} (dollar). When - \b{^} is the first character in a regexp, it means the regexp - must match from the beginning of the string. When \b{$} is the - last character of the regexp, it means the regexp must match to - the end of the string. The regexp becomes \b{^[0-9]{1,2}$}. - Note that assertions, e.g. \b{^} and \b{$}, do not match - characters but locations in the string. - - If you have seen regexps described elsewhere, they may have looked - different from the ones shown here. This is because some sets of - characters and some quantifiers are so common that they have been - given special symbols to represent them. \b{[0-9]} can be - replaced with the symbol \b{\\d}. The quantifier to match - exactly one occurrence, \b{{1,1}}, can be replaced with the - expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So - our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can - also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of - the string, match a digit, followed immediately by 0 or 1 digits}. - In practice, it would be written as \b{^\\d\\d?$}. The \b{?} - is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1 - occurrences. \b{?} makes an expression optional. 
The regexp - \b{^\\d\\d?$} means \e{From the beginning of the string, match - one digit, followed immediately by 0 or 1 more digit, followed - immediately by end of string}. - - To write a regexp that matches one of the words 'mail' \e or - 'letter' \e or 'correspondence' but does not match words that - contain these words, e.g., 'email', 'mailman', 'mailer', and - 'letterbox', start with a regexp that matches 'mail'. Expressed - fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because - a character expression is automatically quantified by - \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an - 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now - we can use the vertical bar \b{|}, which means \b{or}, to - include the other two words, so our regexp for matching any of the - three words becomes \b{mail|letter|correspondence}. Match - 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this - regexp will match one of the three words we want to match, it will - also match words we don't want to match, e.g., 'email'. To - prevent the regexp from matching unwanted words, we must tell it - to begin and end the match at word boundaries. First we enclose - our regexp in parentheses, \b{(mail|letter|correspondence)}. - Parentheses group expressions together, and they identify a part - of the regexp that we wish to \l{capturing text}{capture}. - Enclosing the expression in parentheses allows us to use it as a - component in more complex regexps. It also allows us to examine - which of the three words was actually matched. To force the match - to begin and end on word boundaries, we enclose the regexp in - \b{\\b} \e{word boundary} assertions: - \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means: - \e{Match a word boundary, followed by the regexp in parentheses, - followed by a word boundary}. The \b{\\b} assertion matches a - \e position in the regexp, not a \e character. 
A word boundary is - any non-word character, e.g., a space, newline, or the beginning - or ending of a string. - - If we want to replace ampersand characters with the HTML entity - \b{\&}, the regexp to match is simply \b{\&}. But this - regexp will also match ampersands that have already been converted - to HTML entities. We want to replace only ampersands that are not - already followed by \b{amp;}. For this, we need the negative - lookahead assertion, \b{(?!}__\b{)}. The regexp can then be - written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is} - \b{not} \e{followed by} \b{amp;}. - - If we want to count all the occurrences of 'Eric' and 'Eirik' in a - string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and - \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is - required to avoid matching words that contain either name, - e.g. 'Ericsson'. Note that the second regexp matches more - spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'. - - Some of the examples discussed above are implemented in the - \l{#code-examples}{code examples} section. - - \target characters-and-abbreviations-for-sets-of-characters - \section1 Characters and Abbreviations for Sets of Characters - - \table - \header \li Element \li Meaning - \row \li \b{c} - \li A character represents itself unless it has a special - regexp meaning. e.g. \b{c} matches the character \e c. - \row \li \b{\\c} - \li A character that follows a backslash matches the character - itself, except as specified below. e.g., To match a literal - caret at the beginning of a string, write \b{\\^}. - \row \li \b{\\a} - \li Matches the ASCII bell (BEL, 0x07). - \row \li \b{\\f} - \li Matches the ASCII form feed (FF, 0x0C). - \row \li \b{\\n} - \li Matches the ASCII line feed (LF, 0x0A, Unix newline). - \row \li \b{\\r} - \li Matches the ASCII carriage return (CR, 0x0D). - \row \li \b{\\t} - \li Matches the ASCII horizontal tab (HT, 0x09). 
- \row \li \b{\\v} - \li Matches the ASCII vertical tab (VT, 0x0B). - \row \li \b{\\x\e{hhhh}} - \li Matches the Unicode character corresponding to the - hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF). - \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo}) - \li matches the ASCII/Latin1 character for the octal number - \e{ooo} (between 0 and 0377). - \row \li \b{. (dot)} - \li Matches any character (including newline). - \row \li \b{\\d} - \li Matches a digit (QChar::isDigit()). - \row \li \b{\\D} - \li Matches a non-digit. - \row \li \b{\\s} - \li Matches a whitespace character (QChar::isSpace()). - \row \li \b{\\S} - \li Matches a non-whitespace character. - \row \li \b{\\w} - \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_'). - \row \li \b{\\W} - \li Matches a non-word character. - \row \li \b{\\\e{n}} - \li The \e{n}-th backreference, e.g. \\1, \\2, etc. - \endtable - - \b{Note:} The C++ compiler transforms backslashes in strings. - To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}. - To match the backslash character itself, enter it four times, i.e. - \c{\\\\}. - - \target sets-of-characters - \section1 Sets of Characters - - Square brackets mean match any character contained in the square - brackets. The character set abbreviations described above can - appear in a character set in square brackets. Except for the - character set abbreviations and the following two exceptions, - characters do not have special meanings in square brackets. - - \table - \row \li \b{^} - - \li The caret negates the character set if it occurs as the - first character (i.e. immediately after the opening square - bracket). \b{[abc]} matches 'a' or 'b' or 'c', but - \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'. - - \row \li \b{-} - - \li The dash indicates a range of characters. \b{[W-Z]} - matches 'W' or 'X' or 'Y' or 'Z'. 
- - \endtable - - Using the predefined character set abbreviations is more portable - than using character ranges across platforms and languages. For - example, \b{[0-9]} matches a digit in Western alphabets but - \b{\\d} matches a digit in \e any alphabet. - - Note: In other regexp documentation, sets of characters are often - called "character classes". - - \target quantifiers - \section1 Quantifiers - - By default, an expression is automatically quantified by - \b{{1,1}}, i.e. it should occur exactly once. In the following - list, \b{\e {E}} stands for expression. An expression is a - character, or an abbreviation for a set of characters, or a set of - characters in square brackets, or an expression in parentheses. - - \table - \row \li \b{\e {E}?} - - \li Matches zero or one occurrences of \e E. This quantifier - means \e{The previous expression is optional}, because it - will match whether or not the expression is found. \b{\e - {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?} - matches 'dent' or 'dents'. - - \row \li \b{\e {E}+} - - \li Matches one or more occurrences of \e E. \b{\e {E}+} is - the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0', - '00', '000', etc. - - \row \li \b{\e {E}*} - - \li Matches zero or more occurrences of \e E. It is the same - as \b{\e {E}{0,}}. The \b{*} quantifier is often used - in error where \b{+} should be used. For example, if - \b{\\s*$} is used in an expression to match strings that - end in whitespace, it will match every string because - \b{\\s*$} means \e{Match zero or more whitespaces followed - by end of string}. The correct regexp to match strings that - have at least one trailing whitespace character is - \b{\\s+$}. - - \row \li \b{\e {E}{n}} - - \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}} - is the same as repeating \e E \e n times. For example, - \b{x{5}} is the same as \b{xxxxx}. It is also the same - as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}. 
- - \row \li \b{\e {E}{n,}} - \li Matches at least \e n occurrences of \e E. - - \row \li \b{\e {E}{,m}} - \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}} - is the same as \b{\e {E}{0,m}}. - - \row \li \b{\e {E}{n,m}} - \li Matches at least \e n and at most \e m occurrences of \e E. - \endtable - - To apply a quantifier to more than just the preceding character, - use parentheses to group characters together in an expression. For - example, \b{tag+} matches a 't' followed by an 'a' followed by - at least one 'g', whereas \b{(tag)+} matches at least one - occurrence of 'tag'. - - Note: Quantifiers are normally "greedy". They always match as much - text as they can. For example, \b{0+} matches the first zero it - finds and all the consecutive zeros after the first zero. Applied - to '20005', it matches '2\underline{000}5'. Quantifiers can be made - non-greedy, see setMinimal(). - - \target capturing parentheses - \target backreferences - \section1 Capturing Text - - Parentheses allow us to group elements together so that we can - quantify and capture them. For example if we have the expression - \b{mail|letter|correspondence} that matches a string we know - that \e one of the words matched but not which one. Using - parentheses allows us to "capture" whatever is matched within - their bounds, so if we used \b{(mail|letter|correspondence)} - and matched this regexp against the string "I sent you some email" - we can use the cap() or capturedTexts() functions to extract the - matched characters, in this case 'mail'. - - We can use captured text within the regexp itself. To refer to the - captured text we use \e backreferences which are indexed from 1, - the same as for cap(). 
For example we could search for duplicate - words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a - word boundary followed by one or more word characters followed by - one or more non-word characters followed by the same text as the - first parenthesized expression followed by a word boundary. - - If we want to use parentheses purely for grouping and not for - capturing we can use the non-capturing syntax, e.g. - \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and - end ')'. In this example we match either 'green' or 'blue' but we - do not capture the match so we only know whether or not we matched - but not which color we actually found. Using non-capturing - parentheses is more efficient than using capturing parentheses - since the regexp engine has to do less book-keeping. - - Both capturing and non-capturing parentheses may be nested. - - \target greedy quantifiers - - For historical reasons, quantifiers (e.g. \b{*}) that apply to - capturing parentheses are more "greedy" than other quantifiers. - For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa". - This behavior is different from what other regexp engines do - (notably, Perl). To obtain a more intuitive capturing behavior, - specify QRegExp::RegExp2 to the QRegExp constructor or call - setPatternSyntax(QRegExp::RegExp2). - - \target cap_in_a_loop - - When the number of matches cannot be determined in advance, a - common idiom is to use cap() in a loop. For example: - - \snippet code/src_corelib_text_qregexp.cpp 0 - - \target assertions - \section1 Assertions - - Assertions make some statement about the text at the point where - they occur in the regexp but they do not match any characters. In - the following list \b{\e {E}} stands for any expression. - - \table - \row \li \b{^} - \li The caret signifies the beginning of the string. If you - wish to match a literal \c{^} you must escape it by - writing \c{\\^}. 
For example, \b{^#include} will only - match strings which \e begin with the characters '#include'. - (When the caret is the first character of a character set it - has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.) - - \row \li \b{$} - \li The dollar signifies the end of the string. For example - \b{\\d\\s*$} will match strings which end with a digit - optionally followed by whitespace. If you wish to match a - literal \c{$} you must escape it by writing - \c{\\$}. - - \row \li \b{\\b} - \li A word boundary. For example the regexp - \b{\\bOK\\b} means match immediately after a word - boundary (e.g. start of string or whitespace) the letter 'O' - then the letter 'K' immediately before another word boundary - (e.g. end of string or whitespace). But note that the - assertion does not actually match any whitespace so if we - write \b{(\\bOK\\b)} and we have a match it will only - contain 'OK' even if the string is "It's \underline{OK} now". - - \row \li \b{\\B} - \li A non-word boundary. This assertion is true wherever - \b{\\b} is false. For example if we searched for - \b{\\Bon\\B} in "Left on" the match would fail (space - and end of string aren't non-word boundaries), but it would - match in "t\underline{on}ne". - - \row \li \b{(?=\e E)} - \li Positive lookahead. This assertion is true if the - expression matches at this point in the regexp. For example, - \b{const(?=\\s+char)} matches 'const' whenever it is - followed by 'char', as in 'static \underline{const} char *'. - (Compare with \b{const\\s+char}, which matches 'static - \underline{const char} *'.) - - \row \li \b{(?!\e E)} - \li Negative lookahead. This assertion is true if the - expression does not match at this point in the regexp. For - example, \b{const(?!\\s+char)} matches 'const' \e except - when it is followed by 'char'. 
- \endtable - - \target QRegExp wildcard matching - \section1 Wildcard Matching - - Most command shells such as \e bash or \e cmd.exe support "file - globbing", the ability to identify a group of files by using - wildcards. The setPatternSyntax() function is used to switch - between regexp and wildcard mode. Wildcard matching is much - simpler than full regexps and has only four features: - - \table - \row \li \b{c} - \li Any character represents itself apart from those mentioned - below. Thus \b{c} matches the character \e c. - \row \li \b{?} - \li Matches any single character. It is the same as - \b{.} in full regexps. - \row \li \b{*} - \li Matches zero or more of any characters. It is the - same as \b{.*} in full regexps. - \row \li \b{[...]} - \li Sets of characters can be represented in square brackets, - similar to full regexps. Within the character class, like - outside, backslash has no special meaning. - \endtable - - In the mode Wildcard, the wildcard characters cannot be - escaped. In the mode WildcardUnix, the character '\\' escapes the - wildcard. - - For example if we are in wildcard mode and have strings which - contain filenames we could identify HTML files with \b{*.html}. - This will match zero or more characters followed by a dot followed - by 'h', 't', 'm' and 'l'. - - To test a string against a wildcard expression, use exactMatch(). - For example: - - \snippet code/src_corelib_text_qregexp.cpp 1 - - \target perl-users - \section1 Notes for Perl Users - - Most of the character class abbreviations supported by Perl are - supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters} - {characters and abbreviations for sets of characters}. - - In QRegExp, apart from within character classes, \c{^} always - signifies the start of the string, so carets must always be - escaped unless used for that purpose. In Perl the meaning of caret - varies automagically depending on where it occurs so escaping it - is rarely necessary. 
The same applies to \c{$} which in - QRegExp always signifies the end of the string. - - QRegExp's quantifiers are the same as Perl's greedy quantifiers - (but see the \l{greedy quantifiers}{note above}). Non-greedy - matching cannot be applied to individual quantifiers, but can be - applied to all the quantifiers in the pattern. For example, to - match the Perl regexp \b{ro+?m} requires: - - \snippet code/src_corelib_text_qregexp.cpp 2 - - The equivalent of Perl's \c{/i} option is - setCaseSensitivity(Qt::CaseInsensitive). - - Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}. - - In QRegExp \b{.} matches any character, therefore all QRegExp - regexps have the equivalent of Perl's \c{/s} option. QRegExp - does not have an equivalent to Perl's \c{/m} option, but this - can be emulated in various ways for example by splitting the input - into lines or by looping with a regexp that searches for newlines. - - Because QRegExp is string oriented, there are no \\A, \\Z, or \\z - assertions. The \\G assertion is not supported but can be emulated - in a loop. - - Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp - equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, - ... correspond to cap(1) or capturedTexts()[1], cap(2) or - capturedTexts()[2], etc. - - To substitute a pattern use QString::replace(). - - Perl's extended \c{/x} syntax is not supported, nor are - directives, e.g. (?i), or regexp comments, e.g. (?#comment). On - the other hand, C++'s rules for literal strings can be used to - achieve the same: - - \snippet code/src_corelib_text_qregexp.cpp 3 - - Both zero-width positive and zero-width negative lookahead - assertions (?=pattern) and (?!pattern) are supported with the same - syntax as Perl. Perl's lookbehind assertions, "independent" - subexpressions and conditional expressions are not supported. - - Non-capturing parentheses are also supported, with the same - (?:pattern) syntax. 
- - See QString::split() and QStringList::join() for equivalents - to Perl's split and join functions. - - Note: because C++ transforms \\'s they must be written \e twice in - code, e.g. \b{\\b} must be written \b{\\\\b}. - - \target code-examples - \section1 Code Examples - - \snippet code/src_corelib_text_qregexp.cpp 4 - - The third string matches '\underline{6}'. This is a simple validation - regexp for integers in the range 0 to 99. - - \snippet code/src_corelib_text_qregexp.cpp 5 - - The second string matches '\underline{This_is-OK}'. We've used the - character set abbreviation '\\S' (non-whitespace) and the anchors - to match strings which contain no whitespace. - - In the following example we match strings containing 'mail' or - 'letter' or 'correspondence' but only match whole words i.e. not - 'email' - - \snippet code/src_corelib_text_qregexp.cpp 6 - - The second string matches "Please write the \underline{letter}". The - word 'letter' is also captured (because of the parentheses). We - can see what text we've captured like this: - - \snippet code/src_corelib_text_qregexp.cpp 7 - - This will capture the text from the first set of capturing - parentheses (counting capturing left parentheses from left to - right). The parentheses are counted from 1 since cap(0) is the - whole matched regexp (equivalent to '&' in most regexp engines). - - \snippet code/src_corelib_text_qregexp.cpp 8 - - Here we've passed the QRegExp to QString's replace() function to - replace the matched text with new text. - - \snippet code/src_corelib_text_qregexp.cpp 9 - - We've used the indexIn() function to repeatedly match the regexp in - the string. Note that instead of moving forward by one character - at a time \c pos++ we could have written \c {pos += - rx.matchedLength()} to skip over the already matched string. The - count will equal 3, matching 'One \underline{Eric} another - \underline{Eirik}, and an Ericsson. 
How many Eiriks, \underline{Eric}?'; it - doesn't match 'Ericsson' or 'Eiriks' because they are not bounded - by word boundaries. - - One common use of regexps is to split lines of delimited data into - their component fields. - - \snippet code/src_corelib_text_qregexp.cpp 10 - - In this example our input lines have the format company name, web - address and country. Unfortunately the regexp is rather long and - not very versatile -- the code will break if we add any more - fields. A simpler and better solution is to look for the - separator, '\\t' in this case, and take the surrounding text. The - QString::split() function can take a separator string or regexp - as an argument and split a string accordingly. - - \snippet code/src_corelib_text_qregexp.cpp 11 - - Here field[0] is the company, field[1] the web address and so on. - - To imitate the matching of a shell we can use wildcard mode. - - \snippet code/src_corelib_text_qregexp.cpp 12 - - Wildcard matching can be convenient because of its simplicity, but - any wildcard regexp can be defined using full regexps, e.g. - \b{.*\\.html$}. Notice that we can't match both \c .html and \c - .htm files with a wildcard unless we use \b{*.htm*} which will - also match 'test.html.bak'. A full regexp gives us the precision - we need, \b{.*\\.html?$}. - - QRegExp can match case insensitively using setCaseSensitivity(), - and can use non-greedy matching, see setMinimal(). By - default QRegExp uses full regexps but this can be changed with - setPatternSyntax(). Searching can be done forward with indexIn() or backward - with lastIndexIn(). Captured text can be accessed using - capturedTexts() which returns a string list of all captured - strings, or using cap() which returns the captured string for the - given index. The pos() function takes a match index and returns - the position in the string where the match was made (or -1 if - there was no match). 
- - \sa QString, QStringList, QSortFilterProxyModel, - {tools/regexp}{Regular Expression Example} - - - \section1 Porting to QRegularExpression - - The QRegularExpression class introduced in Qt 5 is a big improvement upon - QRegExp, in terms of APIs offered, supported pattern syntax and speed of - execution. The biggest difference is that QRegularExpression simply holds a - regular expression, and it's \e{not} modified when a match is requested. - Instead, a QRegularExpressionMatch object is returned, in order to check - the result of a match and extract the captured substring. The same applies - with global matching and QRegularExpressionMatchIterator. - - Other differences are outlined below. - - \section2 Different pattern syntax - - Porting a regular expression from QRegExp to QRegularExpression may require - changes to the pattern itself. - - In certain scenarios, QRegExp was too lenient and accepted patterns that - are simply invalid when using QRegularExpression. These are somewhat easy - to detect, because the QRegularExpression objects built with these patterns - are not valid (cf. QRegularExpression::isValid()). - - In other cases, a pattern ported from QRegExp to QRegularExpression may - silently change semantics. Therefore, it is necessary to review the - patterns used. The most notable cases of silent incompatibility are: - - \list - - \li Curly braces are needed in order to use a hexadecimal escape like - \c{\xHHHH} with more than 2 digits. A pattern like \c{\x2022} needs to - be ported to \c{\x{2022}}, or it will match a space (\c{0x20}) followed - by the string \c{"22"}. In general, it is highly recommended to always use - curly braces with the \c{\x} escape, no matter the amount of digits - specified. - - \li A 0-to-n quantification like \c{{,n}} needs to be ported to \c{{0,n}} to - preserve semantics. Otherwise, a pattern such as \c{\d{,3}} would - actually match a digit followed by the exact string \c{"{,3}"}. 
- - \li QRegExp by default does Unicode-aware matching, while - QRegularExpression requires a separate option; see below for more details. - - \li \c{.} in QRegExp does by default match all characters, including the - newline character. QRegularExpression excludes the newline character by - default. To include the newline character, set the - QRegularExpression::DotMatchesEverythingOption pattern option. - - \endlist - - \section2 Porting from QRegExp::exactMatch() - - QRegExp::exactMatch() in Qt 4 served two purposes: it exactly matched - a regular expression against a subject string, and it implemented partial - matching. - - \section3 Porting from QRegExp's Exact Matching - - Exact matching indicates whether the regular expression matches the entire - subject string. For example, the two classes yield the following results on the subject string \c{"abc123"}: - - \table - \header \li \li QRegExp::exactMatch() \li QRegularExpressionMatch::hasMatch() - \row \li \c{"\\d+"} \li \b false \li \b true - \row \li \c{"[a-z]+\\d+"} \li \b true \li \b true - \endtable - - Exact matching is not reflected in QRegularExpression. If you want - to be sure that the subject string matches the regular expression - exactly, you can wrap the pattern using the QRegularExpression::anchoredPattern() - function: - - \snippet code/src_corelib_text_qregexp.cpp 21 - - \section3 Porting from QRegExp's Partial Matching - - When using QRegExp::exactMatch(), if an exact match was not found, one - could still find out how much of the subject string was matched by the - regular expression by calling QRegExp::matchedLength(). If the returned length - was equal to the subject string's length, then one could conclude that a partial - match was found. - - QRegularExpression supports partial matching explicitly by means of the - appropriate MatchType. - - \section2 Global matching - - Due to limitations of the QRegExp API it was impossible to implement global - matching correctly (that is, like Perl does). 
In particular, patterns that - can match 0 characters (like \c{"a*"}) are problematic. - - QRegularExpression::globalMatch() implements Perl global match correctly, and - the returned iterator can be used to examine each result. - - \section2 Unicode properties support - - When using QRegExp, character classes such as \c{\w}, \c{\d}, etc. match - characters with the corresponding Unicode property: for instance, \c{\d} - matches any character with the Unicode Nd (decimal digit) property. - - Those character classes only match ASCII characters by default when using - QRegularExpression: for instance, \c{\d} matches exactly a character in the - \c{0-9} ASCII range. It is possible to change this behavior by using the - UseUnicodePropertiesOption pattern option. - - \section2 Wildcard matching - - There is no direct way to do wildcard matching in QRegularExpression. - However, the wildcardToRegularExpression method is provided to translate - glob patterns into a Perl-compatible regular expression that can be used - for that purpose. - - \section2 Other pattern syntaxes - - QRegularExpression supports only Perl-compatible regular expressions. - - \section2 Minimal matching - - QRegExp::setMinimal() implemented minimal matching by simply reversing the - greediness of the quantifiers (QRegExp did not support lazy quantifiers, - like \c{*?}, \c{+?}, etc.). QRegularExpression instead does support greedy, - lazy and possessive quantifiers. The InvertedGreedinessOption - pattern option can be useful to emulate the effects of QRegExp::setMinimal(): - if enabled, it inverts the greediness of quantifiers (greedy ones become - lazy and vice versa). - - \section2 Caret modes - - The AnchorAtOffsetMatchOption match option can be used to emulate the - QRegExp::CaretAtOffset behavior. There is no equivalent for the other - QRegExp::CaretMode modes. 
-*/ - -#if defined(Q_OS_VXWORKS) && defined(EOS) -# undef EOS -#endif - -const int NumBadChars = 64; -#define BadChar(ch) ((ch).unicode() % NumBadChars) - -const int NoOccurrence = INT_MAX; -const int EmptyCapture = INT_MAX; -const int InftyLen = INT_MAX; -const int InftyRep = 1025; -const int EOS = -1; - -static bool isWord(QChar ch) -{ - return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_'); -} - -/* - Merges two vectors of ints and puts the result into the first - one. -*/ -static void mergeInto(QList<int> *a, const QList<int> &b) -{ - int asize = a->size(); - int bsize = b.size(); - if (asize == 0) { - *a = b; -#ifndef QT_NO_REGEXP_OPTIM - } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) { - a->resize(asize + 1); - (*a)[asize] = b.at(0); -#endif - } else if (bsize >= 1) { - int csize = asize + bsize; - QList<int> c(csize); - int i = 0, j = 0, k = 0; - while (i < asize) { - if (j < bsize) { - if (a->at(i) == b.at(j)) { - ++i; - --csize; - } else if (a->at(i) < b.at(j)) { - c[k++] = a->at(i++); - } else { - c[k++] = b.at(j++); - } - } else { - memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int)); - break; - } - } - c.resize(csize); - if (j < bsize) - memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int)); - *a = c; - } -} - -#ifndef QT_NO_REGEXP_WILDCARD -/* - Translates a wildcard pattern to an equivalent regular expression - pattern (e.g., *.cpp to .*\.cpp). 
- - If enableEscaping is true, it is possible to escape the wildcard - characters with \ -*/ -static QString wc2rx(const QString &wc_str, const bool enableEscaping) -{ - const int wclen = wc_str.length(); - QString rx; - int i = 0; - bool isEscaping = false; // the previous character is '\' - const QChar *wc = wc_str.unicode(); - - while (i < wclen) { - const QChar c = wc[i++]; - switch (c.unicode()) { - case '\\': - if (enableEscaping) { - if (isEscaping) { - rx += QLatin1String("\\\\"); - } // we insert the \\ later if necessary - if (i == wclen) { // the end - rx += QLatin1String("\\\\"); - } - } else { - rx += QLatin1String("\\\\"); - } - isEscaping = true; - break; - case '*': - if (isEscaping) { - rx += QLatin1String("\\*"); - isEscaping = false; - } else { - rx += QLatin1String(".*"); - } - break; - case '?': - if (isEscaping) { - rx += QLatin1String("\\?"); - isEscaping = false; - } else { - rx += QLatin1Char('.'); - } - - break; - case '$': - case '(': - case ')': - case '+': - case '.': - case '^': - case '{': - case '|': - case '}': - if (isEscaping) { - isEscaping = false; - rx += QLatin1String("\\\\"); - } - rx += QLatin1Char('\\'); - rx += c; - break; - case '[': - if (isEscaping) { - isEscaping = false; - rx += QLatin1String("\\["); - } else { - rx += c; - if (wc[i] == QLatin1Char('^')) - rx += wc[i++]; - if (i < wclen) { - if (wc[i] == QLatin1Char(']')) - rx += wc[i++]; - while (i < wclen && wc[i] != QLatin1Char(']')) { - if (wc[i] == QLatin1Char('\\')) - rx += QLatin1Char('\\'); - rx += wc[i++]; - } - } - } - break; - - case ']': - if(isEscaping){ - isEscaping = false; - rx += QLatin1String("\\"); - } - rx += c; - break; - - default: - if(isEscaping){ - isEscaping = false; - rx += QLatin1String("\\\\"); - } - rx += c; - } - } - return rx; -} -#endif - -static int caretIndex(int offset, QRegExp::CaretMode caretMode) -{ - if (caretMode == QRegExp::CaretAtZero) { - return 0; - } else if (caretMode == QRegExp::CaretAtOffset) { - return offset; - } else 
{ // QRegExp::CaretWontMatch - return -1; - } -} - -/* - The QRegExpEngineKey struct uniquely identifies an engine. -*/ -struct QRegExpEngineKey -{ - QString pattern; - QRegExp::PatternSyntax patternSyntax; - Qt::CaseSensitivity cs; - - inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax, - Qt::CaseSensitivity cs) - : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {} - - inline void clear() { - pattern.clear(); - patternSyntax = QRegExp::RegExp; - cs = Qt::CaseSensitive; - } -}; - -static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2) -{ - return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax - && key1.cs == key2.cs; -} - -static size_t qHash(const QRegExpEngineKey &key, size_t seed = 0) noexcept -{ - return qHashMulti(seed, key.pattern, key.patternSyntax, key.cs); -} - -class QRegExpEngine; - -/* - This is the engine state during matching. -*/ -struct QRegExpMatchState -{ - const QChar *in; // a pointer to the input string data - int pos; // the current position in the string - int caretPos; - int len; // the length of the input string - bool minimal; // minimal matching? - int *bigArray; // big array holding the data for the next pointers - int *inNextStack; // is state is nextStack? 
- int *curStack; // stack of current states - int *nextStack; // stack of next states - int *curCapBegin; // start of current states' captures - int *nextCapBegin; // start of next states' captures - int *curCapEnd; // end of current states' captures - int *nextCapEnd; // end of next states' captures - int *tempCapBegin; // start of temporary captures - int *tempCapEnd; // end of temporary captures - int *capBegin; // start of captures for a next state - int *capEnd; // end of captures for a next state - int *slideTab; // bump-along slide table for bad-character heuristic - int *captured; // what match() returned last - int slideTabSize; // size of slide table - int capturedSize; -#ifndef QT_NO_REGEXP_BACKREF - QList<QList<int>> sleeping; // list of back-reference sleepers -#endif - int matchLen; // length of match - int oneTestMatchedLen; // length of partial match - - const QRegExpEngine *eng; - - inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {} - inline ~QRegExpMatchState() { free(bigArray); } - - void drain() { free(bigArray); bigArray = nullptr; captured = nullptr; } // to save memory - void prepareForMatch(QRegExpEngine *eng); - void match(const QChar *str, int len, int pos, bool minimal, - bool oneTest, int caretIndex); - bool matchHere(); - bool testAnchor(int i, int a, const int *capBegin); -}; - -/* - The struct QRegExpAutomatonState represents one state in a modified NFA. The - input characters matched are stored in the state instead of on - the transitions, something possible for an automaton - constructed from a regular expression. -*/ -struct QRegExpAutomatonState -{ -#ifndef QT_NO_REGEXP_CAPTURE - int atom; // which atom does this state belong to? -#endif - int match; // what does it match? 
(see CharClassBit and BackRefBit) - QList<int> outs; // out-transitions - QMap<int, int> reenter; // atoms reentered when transiting out - QMap<int, int> anchors; // anchors met when transiting out - - inline QRegExpAutomatonState() { } -#ifndef QT_NO_REGEXP_CAPTURE - inline QRegExpAutomatonState(int a, int m) - : atom(a), match(m) { } -#else - inline QRegExpAutomatonState(int m) - : match(m) { } -#endif -}; - -Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE); - -/* - The struct QRegExpCharClassRange represents a range of characters (e.g., - [0-9] denotes range 48 to 57). -*/ -struct QRegExpCharClassRange -{ - ushort from; // 48 - ushort len; // 10 -}; - -Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE); - -#ifndef QT_NO_REGEXP_CAPTURE -/* - The struct QRegExpAtom represents one node in the hierarchy of regular - expression atoms. -*/ -struct QRegExpAtom -{ - enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 }; - - int parent; // index of parent in array of atoms - int capture; // index of capture, from 1 to ncap - 1 -}; - -Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE); -#endif - -struct QRegExpLookahead; - -#ifndef QT_NO_REGEXP_ANCHOR_ALT -/* - The struct QRegExpAnchorAlternation represents a pair of anchors with - OR semantics. -*/ -struct QRegExpAnchorAlternation -{ - int a; // this anchor... - int b; // ...or this one -}; - -Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE); -#endif - -#ifndef QT_NO_REGEXP_CCLASS - -#define FLAG(x) (1 << (x)) -/* - The class QRegExpCharClass represents a set of characters, such as can - be found in regular expressions (e.g., [a-z] denotes the set - {a, b, ..., z}). 
-*/ -class QRegExpCharClass -{ -public: - QRegExpCharClass(); - - void clear(); - bool negative() const { return n; } - void setNegative(bool negative); - void addCategories(uint cats); - void addRange(ushort from, ushort to); - void addSingleton(ushort ch) { addRange(ch, ch); } - - bool in(QChar ch) const; -#ifndef QT_NO_REGEXP_OPTIM - const QList<int> &firstOccurrence() const { return occ1; } -#endif - -#if defined(QT_DEBUG) - void dump() const; -#endif - -private: - QList<QRegExpCharClassRange> r; // character ranges -#ifndef QT_NO_REGEXP_OPTIM - QList<int> occ1; // first-occurrence array -#endif - uint c; // character classes - bool n; // negative? -}; -#else -struct QRegExpCharClass -{ - int dummy; - -#ifndef QT_NO_REGEXP_OPTIM - QRegExpCharClass() { occ1.fill(0, NumBadChars); } - - const QList<int> &firstOccurrence() const { return occ1; } - QList<int> occ1; -#endif -}; -#endif - -Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE); - -/* - The QRegExpEngine class encapsulates a modified nondeterministic - finite automaton (NFA). 
-*/ -class QRegExpEngine -{ -public: - QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers) - : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); } - - QRegExpEngine(const QRegExpEngineKey &key); - ~QRegExpEngine(); - - bool isValid() const { return valid; } - const QString &errorString() const { return yyError; } - int captureCount() const { return officialncap; } - - int createState(QChar ch); - int createState(const QRegExpCharClass &cc); -#ifndef QT_NO_REGEXP_BACKREF - int createState(int bref); -#endif - - void addCatTransitions(const QList<int> &from, const QList<int> &to); -#ifndef QT_NO_REGEXP_CAPTURE - void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom); -#endif - -#ifndef QT_NO_REGEXP_ANCHOR_ALT - int anchorAlternation(int a, int b); - int anchorConcatenation(int a, int b); -#else - int anchorAlternation(int a, int b) { return a & b; } - int anchorConcatenation(int a, int b) { return a | b; } -#endif - void addAnchors(int from, int to, int a); - -#ifndef QT_NO_REGEXP_OPTIM - void heuristicallyChooseHeuristic(); -#endif - -#if defined(QT_DEBUG) - void dump() const; -#endif - - QAtomicInt ref; - -private: - enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; - enum { InitialState = 0, FinalState = 1 }; - - void setup(); - int setupState(int match); - - /* - Let's hope that 13 lookaheads and 14 back-references are - enough. 
- */ - enum { MaxLookaheads = 13, MaxBackRefs = 14 }; - enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, - Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, - Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, - Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, - Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs, - - Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^ - ((Anchor_FirstLookahead << MaxLookaheads) - 1) }; -#ifndef QT_NO_REGEXP_CAPTURE - int startAtom(bool officialCapture); - void finishAtom(int atom, bool needCapture); -#endif - -#ifndef QT_NO_REGEXP_LOOKAHEAD - int addLookahead(QRegExpEngine *eng, bool negative); -#endif - -#ifndef QT_NO_REGEXP_OPTIM - bool goodStringMatch(QRegExpMatchState &matchState) const; - bool badCharMatch(QRegExpMatchState &matchState) const; -#else - bool bruteMatch(QRegExpMatchState &matchState) const; -#endif - - QList<QRegExpAutomatonState> s; // array of states -#ifndef QT_NO_REGEXP_CAPTURE - QList<QRegExpAtom> f; // atom hierarchy - int nf; // number of atoms - int cf; // current atom - QList<int> captureForOfficialCapture; -#endif - int officialncap; // number of captures, seen from the outside - int ncap; // number of captures, seen from the inside -#ifndef QT_NO_REGEXP_CCLASS - QList<QRegExpCharClass> cl; // array of character classes -#endif -#ifndef QT_NO_REGEXP_LOOKAHEAD - QList<QRegExpLookahead *> ahead; // array of lookaheads -#endif -#ifndef QT_NO_REGEXP_ANCHOR_ALT - QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors -#endif -#ifndef QT_NO_REGEXP_OPTIM - bool caretAnchored; // does the regexp start with ^? - bool trivial; // is the good-string all that needs to match? -#endif - bool valid; // is the regular expression valid? - Qt::CaseSensitivity cs; // case sensitive? - bool greedyQuantifiers; // RegExp2? 
- bool xmlSchemaExtensions; -#ifndef QT_NO_REGEXP_BACKREF - int nbrefs; // number of back-references -#endif - -#ifndef QT_NO_REGEXP_OPTIM - bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch - - int goodEarlyStart; // the index where goodStr can first occur in a match - int goodLateStart; // the index where goodStr can last occur in a match - QString goodStr; // the string that any match has to contain - - int minl; // the minimum length of a match - QList<int> occ1; // first-occurrence array -#endif - - /* - The class Box is an abstraction for a regular expression - fragment. It can also be seen as one node in the syntax tree of - a regular expression with synthetized attributes. - - Its interface is ugly for performance reasons. - */ - class Box - { - public: - Box(QRegExpEngine *engine); - Box(const Box &b) { operator=(b); } - - Box &operator=(const Box &b); - - void clear() { operator=(Box(eng)); } - void set(QChar ch); - void set(const QRegExpCharClass &cc); -#ifndef QT_NO_REGEXP_BACKREF - void set(int bref); -#endif - - void cat(const Box &b); - void orx(const Box &b); - void plus(int atom); - void opt(); - void catAnchor(int a); -#ifndef QT_NO_REGEXP_OPTIM - void setupHeuristics(); -#endif - -#if defined(QT_DEBUG) - void dump() const; -#endif - - private: - void addAnchorsToEngine(const Box &to) const; - - QRegExpEngine *eng; // the automaton under construction - QList<int> ls; // the left states (firstpos) - QList<int> rs; // the right states (lastpos) - QMap<int, int> lanchors; // the left anchors - QMap<int, int> ranchors; // the right anchors - int skipanchors; // the anchors to match if the box is skipped - -#ifndef QT_NO_REGEXP_OPTIM - int earlyStart; // the index where str can first occur - int lateStart; // the index where str can last occur - QString str; // a string that has to occur in any match - QString leftStr; // a string occurring at the left of this box - QString rightStr; // a string occurring at the right of this 
box - int maxl; // the maximum length of this box (possibly InftyLen) -#endif - - int minl; // the minimum length of this box -#ifndef QT_NO_REGEXP_OPTIM - QList<int> occ1; // first-occurrence array -#endif - }; - - friend class Box; - - /* - This is the lexical analyzer for regular expressions. - */ - enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, - Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, - Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; - int getChar(); - int getEscape(); -#ifndef QT_NO_REGEXP_INTERVAL - int getRep(int def); -#endif -#ifndef QT_NO_REGEXP_LOOKAHEAD - void skipChars(int n); -#endif - void error(const char *msg); - void startTokenizer(const QChar *rx, int len); - int getToken(); - - const QChar *yyIn; // a pointer to the input regular expression pattern - int yyPos0; // the position of yyTok in the input pattern - int yyPos; // the position of the next character to read - int yyLen; // the length of yyIn - int yyCh; // the last character read - QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens - int yyMinRep; // attribute for Tok_Quantifier - int yyMaxRep; // ditto - QString yyError; // syntax error or overflow during parsing? - - /* - This is the syntactic analyzer for regular expressions. - */ - int parse(const QChar *rx, int len); - void parseAtom(Box *box); - void parseFactor(Box *box); - void parseTerm(Box *box); - void parseExpression(Box *box); - - int yyTok; // the last token read - bool yyMayCapture; // set this to false to disable capturing - - friend struct QRegExpMatchState; -}; - -#ifndef QT_NO_REGEXP_LOOKAHEAD -/* - The struct QRegExpLookahead represents a lookahead a la Perl (e.g., - (?=foo) and (?!bar)). -*/ -struct QRegExpLookahead -{ - QRegExpEngine *eng; // NFA representing the embedded regular expression - bool neg; // negative lookahead? 
- - inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0) - : eng(eng0), neg(neg0) { } - inline ~QRegExpLookahead() { delete eng; } -}; -#endif - -/*! - \internal - convert the pattern string to the RegExp syntax. - - This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan - */ -Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax) -{ - switch (patternSyntax) { -#ifndef QT_NO_REGEXP_WILDCARD - case QRegExp::Wildcard: - return wc2rx(pattern, false); - case QRegExp::WildcardUnix: - return wc2rx(pattern, true); -#endif - case QRegExp::FixedString: - return QRegExp::escape(pattern); - case QRegExp::W3CXmlSchema11: - default: - return pattern; - } -} - -QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) - : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2), - xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11) -{ - setup(); - - QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax); - - valid = (parse(rx.unicode(), rx.length()) == rx.length()); - if (!valid) { -#ifndef QT_NO_REGEXP_OPTIM - trivial = false; -#endif - error(RXERR_LEFTDELIM); - } -} - -QRegExpEngine::~QRegExpEngine() -{ -#ifndef QT_NO_REGEXP_LOOKAHEAD - qDeleteAll(ahead); -#endif -} - -void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng) -{ - /* - We use one QList<int> for all the big data used a lot in - matchHere() and friends. 
- */ - int ns = eng->s.size(); // number of states - int ncap = eng->ncap; -#ifndef QT_NO_REGEXP_OPTIM - int newSlideTabSize = qMax(eng->minl + 1, 16); -#else - int newSlideTabSize = 0; -#endif - int numCaptures = eng->captureCount(); - int newCapturedSize = 2 + 2 * numCaptures; - bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int))); - - // set all internal variables only _after_ bigArray is realloc'ed - // to prevent a broken regexp in oom case - - slideTabSize = newSlideTabSize; - capturedSize = newCapturedSize; - inNextStack = bigArray; - memset(inNextStack, -1, ns * sizeof(int)); - curStack = inNextStack + ns; - nextStack = inNextStack + 2 * ns; - - curCapBegin = inNextStack + 3 * ns; - nextCapBegin = curCapBegin + ncap * ns; - curCapEnd = curCapBegin + 2 * ncap * ns; - nextCapEnd = curCapBegin + 3 * ncap * ns; - - tempCapBegin = curCapBegin + 4 * ncap * ns; - tempCapEnd = tempCapBegin + ncap; - capBegin = tempCapBegin + 2 * ncap; - capEnd = tempCapBegin + 3 * ncap; - - slideTab = tempCapBegin + 4 * ncap; - captured = slideTab + slideTabSize; - memset(captured, -1, capturedSize*sizeof(int)); - this->eng = eng; -} - -/* - Tries to match in str and returns an array of (begin, length) pairs - for captured text. If there is no match, all pairs are (-1, -1). 
-*/ -void QRegExpMatchState::match(const QChar *str0, int len0, int pos0, - bool minimal0, bool oneTest, int caretIndex) -{ - bool matched = false; - QChar char_null; - -#ifndef QT_NO_REGEXP_OPTIM - if (eng->trivial && !oneTest) { - // ### Qt6: qsizetype - pos = int(QtPrivate::findString(QStringView(str0, len0), pos0, QStringView(eng->goodStr.unicode(), eng->goodStr.length()), eng->cs)); - matchLen = eng->goodStr.length(); - matched = (pos != -1); - } else -#endif - { - in = str0; - if (in == nullptr) - in = &char_null; - pos = pos0; - caretPos = caretIndex; - len = len0; - minimal = minimal0; - matchLen = 0; - oneTestMatchedLen = 0; - - if (eng->valid && pos >= 0 && pos <= len) { -#ifndef QT_NO_REGEXP_OPTIM - if (oneTest) { - matched = matchHere(); - } else { - if (pos <= len - eng->minl) { - if (eng->caretAnchored) { - matched = matchHere(); - } else if (eng->useGoodStringHeuristic) { - matched = eng->goodStringMatch(*this); - } else { - matched = eng->badCharMatch(*this); - } - } - } -#else - matched = oneTest ? matchHere() : eng->bruteMatch(*this); -#endif - } - } - - if (matched) { - int *c = captured; - *c++ = pos; - *c++ = matchLen; - - int numCaptures = (capturedSize - 2) >> 1; -#ifndef QT_NO_REGEXP_CAPTURE - for (int i = 0; i < numCaptures; ++i) { - int j = eng->captureForOfficialCapture.at(i); - if (capBegin[j] != EmptyCapture) { - int len = capEnd[j] - capBegin[j]; - *c++ = (len > 0) ? pos + capBegin[j] : 0; - *c++ = len; - } else { - *c++ = -1; - *c++ = -1; - } - } -#endif - } else { - // we rely on 2's complement here - memset(captured, -1, capturedSize * sizeof(int)); - } -} - -/* - The three following functions add one state to the automaton and - return the number of the state. 
-*/ - -int QRegExpEngine::createState(QChar ch) -{ - return setupState(ch.unicode()); -} - -int QRegExpEngine::createState(const QRegExpCharClass &cc) -{ -#ifndef QT_NO_REGEXP_CCLASS - int n = cl.size(); - cl += QRegExpCharClass(cc); - return setupState(CharClassBit | n); -#else - Q_UNUSED(cc); - return setupState(CharClassBit); -#endif -} - -#ifndef QT_NO_REGEXP_BACKREF -int QRegExpEngine::createState(int bref) -{ - if (bref > nbrefs) { - nbrefs = bref; - if (nbrefs > MaxBackRefs) { - error(RXERR_LIMIT); - return 0; - } - } - return setupState(BackRefBit | bref); -} -#endif - -/* - The two following functions add a transition between all pairs of - states (i, j) where i is found in from, and j is found in to. - - Cat-transitions are distinguished from plus-transitions for - capturing. -*/ - -void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to) -{ - for (int i = 0; i < from.size(); i++) - mergeInto(&s[from.at(i)].outs, to); -} - -#ifndef QT_NO_REGEXP_CAPTURE -void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom) -{ - for (int i = 0; i < from.size(); i++) { - QRegExpAutomatonState &st = s[from.at(i)]; - const QList<int> oldOuts = st.outs; - mergeInto(&st.outs, to); - if (f.at(atom).capture != QRegExpAtom::NoCapture) { - for (int j = 0; j < to.size(); j++) { - // ### st.reenter.contains(to.at(j)) check looks suspicious - if (!st.reenter.contains(to.at(j)) && - !std::binary_search(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j))) - st.reenter.insert(to.at(j), atom); - } - } - } -} -#endif - -#ifndef QT_NO_REGEXP_ANCHOR_ALT -/* - Returns an anchor that means a OR b. 
-*/ -int QRegExpEngine::anchorAlternation(int a, int b) -{ - if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) - return a & b; - - int n = aa.size(); -#ifndef QT_NO_REGEXP_OPTIM - if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b) - return Anchor_Alternation | (n - 1); -#endif - - QRegExpAnchorAlternation element = {a, b}; - aa.append(element); - return Anchor_Alternation | n; -} - -/* - Returns an anchor that means a AND b. -*/ -int QRegExpEngine::anchorConcatenation(int a, int b) -{ - if (((a | b) & Anchor_Alternation) == 0) - return a | b; - if ((b & Anchor_Alternation) != 0) - qSwap(a, b); - - int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b); - int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b); - return anchorAlternation(aprime, bprime); -} -#endif - -/* - Adds anchor a on a transition caracterised by its from state and - its to state. -*/ -void QRegExpEngine::addAnchors(int from, int to, int a) -{ - QRegExpAutomatonState &st = s[from]; - if (st.anchors.contains(to)) - a = anchorAlternation(st.anchors.value(to), a); - st.anchors.insert(to, a); -} - -#ifndef QT_NO_REGEXP_OPTIM -/* - This function chooses between the good-string and the bad-character - heuristics. It computes two scores and chooses the heuristic with - the highest score. - - Here are some common-sense constraints on the scores that should be - respected if the formulas are ever modified: (1) If goodStr is - empty, the good-string heuristic scores 0. (2) If the regular - expression is trivial, the good-string heuristic should be used. - (3) If the search is case insensitive, the good-string heuristic - should be used, unless it scores 0. (Case insensitivity turns all - entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is - big, the good-string heuristic should score less. 
-*/ -void QRegExpEngine::heuristicallyChooseHeuristic() -{ - if (minl == 0) { - useGoodStringHeuristic = false; - } else if (trivial) { - useGoodStringHeuristic = true; - } else { - /* - Magic formula: The good string has to constitute a good - proportion of the minimum-length string, and appear at a - more-or-less known index. - */ - int goodStringScore = (64 * goodStr.length() / minl) - - (goodLateStart - goodEarlyStart); - /* - Less magic formula: We pick some characters at random, and - check whether they are good or bad. - */ - int badCharScore = 0; - int step = qMax(1, NumBadChars / 32); - for (int i = 1; i < NumBadChars; i += step) { - if (occ1.at(i) == NoOccurrence) - badCharScore += minl; - else - badCharScore += occ1.at(i); - } - badCharScore /= minl; - useGoodStringHeuristic = (goodStringScore > badCharScore); - } -} -#endif - -#if defined(QT_DEBUG) -void QRegExpEngine::dump() const -{ - int i, j; - qDebug("Case %ssensitive engine", cs ? "" : "in"); - qDebug(" States"); - for (i = 0; i < s.size(); i++) { - qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? 
" (final)" : ""); -#ifndef QT_NO_REGEXP_CAPTURE - if (nf > 0) - qDebug(" in atom %d", s[i].atom); -#endif - int m = s[i].match; - if ((m & CharClassBit) != 0) { - qDebug(" match character class %d", m ^ CharClassBit); -#ifndef QT_NO_REGEXP_CCLASS - cl[m ^ CharClassBit].dump(); -#else - qDebug(" negative character class"); -#endif - } else if ((m & BackRefBit) != 0) { - qDebug(" match back-reference %d", m ^ BackRefBit); - } else if (m >= 0x20 && m <= 0x7e) { - qDebug(" match 0x%.4x (%c)", m, m); - } else { - qDebug(" match 0x%.4x", m); - } - for (j = 0; j < s[i].outs.size(); j++) { - int next = s[i].outs[j]; - qDebug(" -> %d", next); - if (s[i].reenter.contains(next)) - qDebug(" [reenter %d]", s[i].reenter[next]); - if (s[i].anchors.value(next) != 0) - qDebug(" [anchors 0x%.8x]", s[i].anchors[next]); - } - } -#ifndef QT_NO_REGEXP_CAPTURE - if (nf > 0) { - qDebug(" Atom Parent Capture"); - for (i = 0; i < nf; i++) { - if (f[i].capture == QRegExpAtom::NoCapture) { - qDebug(" %6d %6d nil", i, f[i].parent); - } else { - int cap = f[i].capture; - bool official = captureForOfficialCapture.contains(cap); - qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture, - official ? 
"official" : ""); - } - } - } -#endif -#ifndef QT_NO_REGEXP_ANCHOR_ALT - for (i = 0; i < aa.size(); i++) - qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b); -#endif -} -#endif - -void QRegExpEngine::setup() -{ - ref.storeRelaxed(1); -#ifndef QT_NO_REGEXP_CAPTURE - f.resize(32); - nf = 0; - cf = -1; -#endif - officialncap = 0; - ncap = 0; -#ifndef QT_NO_REGEXP_OPTIM - caretAnchored = true; - trivial = true; -#endif - valid = false; -#ifndef QT_NO_REGEXP_BACKREF - nbrefs = 0; -#endif -#ifndef QT_NO_REGEXP_OPTIM - useGoodStringHeuristic = true; - minl = 0; - occ1.fill(0, NumBadChars); -#endif -} - -int QRegExpEngine::setupState(int match) -{ -#ifndef QT_NO_REGEXP_CAPTURE - s += QRegExpAutomatonState(cf, match); -#else - s += QRegExpAutomatonState(match); -#endif - return s.size() - 1; -} - -#ifndef QT_NO_REGEXP_CAPTURE -/* - Functions startAtom() and finishAtom() should be called to delimit - atoms. When a state is created, it is assigned to the current atom. - The information is later used for capturing. -*/ -int QRegExpEngine::startAtom(bool officialCapture) -{ - if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size()) - f.resize((nf + 1) << 1); - f[nf].parent = cf; - cf = nf++; - f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture; - return cf; -} - -void QRegExpEngine::finishAtom(int atom, bool needCapture) -{ - if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture) - f[atom].capture = QRegExpAtom::UnofficialCapture; - cf = f.at(atom).parent; -} -#endif - -#ifndef QT_NO_REGEXP_LOOKAHEAD -/* - Creates a lookahead anchor. -*/ -int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative) -{ - int n = ahead.size(); - if (n == MaxLookaheads) { - error(RXERR_LIMIT); - return 0; - } - ahead += new QRegExpLookahead(eng, negative); - return Anchor_FirstLookahead << n; -} -#endif - -#ifndef QT_NO_REGEXP_CAPTURE -/* - We want the longest leftmost captures. 
-*/ -static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2, - const int *end2) -{ - for (int i = 0; i < ncap; i++) { - int delta = begin2[i] - begin1[i]; // it has to start early... - if (delta == 0) - delta = end1[i] - end2[i]; // ...and end late - - if (delta != 0) - return delta > 0; - } - return false; -} -#endif - -/* - Returns \c true if anchor a matches at position pos + i in the input - string, otherwise false. -*/ -bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin) -{ - int j; - -#ifndef QT_NO_REGEXP_ANCHOR_ALT - if ((a & QRegExpEngine::Anchor_Alternation) != 0) - return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin) - || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin); -#endif - - if ((a & QRegExpEngine::Anchor_Caret) != 0) { - if (pos + i != caretPos) - return false; - } - if ((a & QRegExpEngine::Anchor_Dollar) != 0) { - if (pos + i != len) - return false; - } -#ifndef QT_NO_REGEXP_ESCAPE - if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) { - bool before = false; - bool after = false; - if (pos + i != 0) - before = isWord(in[pos + i - 1]); - if (pos + i != len) - after = isWord(in[pos + i]); - if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after)) - return false; - if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after)) - return false; - } -#endif -#ifndef QT_NO_REGEXP_LOOKAHEAD - if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) { - const QList<QRegExpLookahead *> &ahead = eng->ahead; - for (j = 0; j < ahead.size(); j++) { - if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) { - QRegExpMatchState matchState; - matchState.prepareForMatch(ahead[j]->eng); - matchState.match(in + pos + i, len - pos - i, 0, - true, true, caretPos - pos - i); - if ((matchState.captured[0] == 0) == ahead[j]->neg) - return false; - } - } - } -#endif -#ifndef QT_NO_REGEXP_CAPTURE -#ifndef 
QT_NO_REGEXP_BACKREF - for (j = 0; j < eng->nbrefs; j++) { - if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) { - int i = eng->captureForOfficialCapture.at(j); - if (capBegin[i] != EmptyCapture) - return false; - } - } -#endif -#endif - return true; -} - -#ifndef QT_NO_REGEXP_OPTIM -/* - The three following functions are what Jeffrey Friedl would call - transmissions (or bump-alongs). Using one or the other should make - no difference except in performance. -*/ - -bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const -{ - int k = matchState.pos + goodEarlyStart; - QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs); - while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) { - int from = k - goodLateStart; - int to = k - goodEarlyStart; - if (from > matchState.pos) - matchState.pos = from; - - while (matchState.pos <= to) { - if (matchState.matchHere()) - return true; - ++matchState.pos; - } - ++k; - } - return false; -} - -bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const -{ - int slideHead = 0; - int slideNext = 0; - int i; - int lastPos = matchState.len - minl; - memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int)); - - /* - Set up the slide table, used for the bad-character heuristic, - using the table of first occurrence of each character. 
- */ - for (i = 0; i < minl; i++) { - int sk = occ1[BadChar(matchState.in[matchState.pos + i])]; - if (sk == NoOccurrence) - sk = i + 1; - if (sk > 0) { - int k = i + 1 - sk; - if (k < 0) { - sk = i + 1; - k = 0; - } - if (sk > matchState.slideTab[k]) - matchState.slideTab[k] = sk; - } - } - - if (matchState.pos > lastPos) - return false; - - for (;;) { - if (++slideNext >= matchState.slideTabSize) - slideNext = 0; - if (matchState.slideTab[slideHead] > 0) { - if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext]) - matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1; - matchState.slideTab[slideHead] = 0; - } else { - if (matchState.matchHere()) - return true; - } - - if (matchState.pos == lastPos) - break; - - /* - Update the slide table. This code has much in common with - the initialization code. - */ - int sk = occ1[BadChar(matchState.in[matchState.pos + minl])]; - if (sk == NoOccurrence) { - matchState.slideTab[slideNext] = minl; - } else if (sk > 0) { - int k = slideNext + minl - sk; - if (k >= matchState.slideTabSize) - k -= matchState.slideTabSize; - if (sk > matchState.slideTab[k]) - matchState.slideTab[k] = sk; - } - slideHead = slideNext; - ++matchState.pos; - } - return false; -} -#else -bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const -{ - while (matchState.pos <= matchState.len) { - if (matchState.matchHere()) - return true; - ++matchState.pos; - } - return false; -} -#endif - -/* - Here's the core of the engine. It tries to do a match here and now. 
-*/ -bool QRegExpMatchState::matchHere() -{ - int ncur = 1, nnext = 0; - int i = 0, j, k, m; - bool stop = false; - - matchLen = -1; - oneTestMatchedLen = -1; - curStack[0] = QRegExpEngine::InitialState; - - int ncap = eng->ncap; -#ifndef QT_NO_REGEXP_CAPTURE - if (ncap > 0) { - for (j = 0; j < ncap; j++) { - curCapBegin[j] = EmptyCapture; - curCapEnd[j] = EmptyCapture; - } - } -#endif - -#ifndef QT_NO_REGEXP_BACKREF - while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop) -#else - while (ncur > 0 && i <= len - pos && !stop) -#endif - { - int ch = (i < len - pos) ? in[pos + i].unicode() : 0; - for (j = 0; j < ncur; j++) { - int cur = curStack[j]; - const QRegExpAutomatonState &scur = eng->s.at(cur); - const QList<int> &outs = scur.outs; - for (k = 0; k < outs.size(); k++) { - int next = outs.at(k); - const QRegExpAutomatonState &snext = eng->s.at(next); - bool inside = true; -#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) - int needSomeSleep = 0; -#endif - - /* - First, check if the anchors are anchored properly. - */ - int a = scur.anchors.value(next); - if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap)) - inside = false; - - /* - If indeed they are, check if the input character is - correct for this transition. 
- */ - if (inside) { - m = snext.match; - if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) { - if (eng->cs) - inside = (m == ch); - else - inside = (QChar(m).toLower() == QChar(ch).toLower()); - } else if (next == QRegExpEngine::FinalState) { - matchLen = i; - stop = minimal; - inside = true; - } else if ((m & QRegExpEngine::CharClassBit) != 0) { -#ifndef QT_NO_REGEXP_CCLASS - const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit); - if (eng->cs) - inside = cc.in(QChar(ch)); - else if (cc.negative()) - inside = cc.in(QChar(ch).toLower()) && - cc.in(QChar(ch).toUpper()); - else - inside = cc.in(QChar(ch).toLower()) || - cc.in(QChar(ch).toUpper()); -#endif -#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) - } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */ - int bref = m ^ QRegExpEngine::BackRefBit; - int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1); - - inside = bref <= ncap && curCapBegin[ell] != EmptyCapture; - if (inside) { - if (eng->cs) - inside = (in[pos + curCapBegin[ell]] == QChar(ch)); - else - inside = (in[pos + curCapBegin[ell]].toLower() - == QChar(ch).toLower()); - } - - if (inside) { - int delta; - if (curCapEnd[ell] == EmptyCapture) - delta = i - curCapBegin[ell]; - else - delta = curCapEnd[ell] - curCapBegin[ell]; - - inside = (delta <= len - (pos + i)); - if (inside && delta > 1) { - int n = 1; - if (eng->cs) { - while (n < delta) { - if (in[pos + curCapBegin[ell] + n] - != in[pos + i + n]) - break; - ++n; - } - } else { - while (n < delta) { - QChar a = in[pos + curCapBegin[ell] + n]; - QChar b = in[pos + i + n]; - if (a.toLower() != b.toLower()) - break; - ++n; - } - } - inside = (n == delta); - if (inside) - needSomeSleep = delta - 1; - } - } -#endif - } - } - - /* - We must now update our data structures. - */ - if (inside) { -#ifndef QT_NO_REGEXP_CAPTURE - int *capBegin, *capEnd; -#endif - /* - If the next state was not encountered yet, all - is fine. 
- */ - if ((m = inNextStack[next]) == -1) { - m = nnext++; - nextStack[m] = next; - inNextStack[next] = m; -#ifndef QT_NO_REGEXP_CAPTURE - capBegin = nextCapBegin + m * ncap; - capEnd = nextCapEnd + m * ncap; - - /* - Otherwise, we'll first maintain captures in - temporary arrays, and decide at the end whether - it's best to keep the previous capture zones or - the new ones. - */ - } else { - capBegin = tempCapBegin; - capEnd = tempCapEnd; -#endif - } - -#ifndef QT_NO_REGEXP_CAPTURE - /* - Updating the capture zones is much of a task. - */ - if (ncap > 0) { - memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int)); - memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int)); - int c = scur.atom, n = snext.atom; - int p = -1, q = -1; - int cap; - - /* - Lemma 1. For any x in the range [0..nf), we - have f[x].parent < x. - - Proof. By looking at startAtom(), it is - clear that cf < nf holds all the time, and - thus that f[nf].parent < nf. - */ - - /* - If we are reentering an atom, we empty all - capture zones inside it. - */ - if ((q = scur.reenter.value(next)) != 0) { - QBitArray b(eng->nf, false); - b.setBit(q, true); - for (int ell = q + 1; ell < eng->nf; ell++) { - if (b.testBit(eng->f.at(ell).parent)) { - b.setBit(ell, true); - cap = eng->f.at(ell).capture; - if (cap >= 0) { - capBegin[cap] = EmptyCapture; - capEnd[cap] = EmptyCapture; - } - } - } - p = eng->f.at(q).parent; - - /* - Otherwise, close the capture zones we are - leaving. We are leaving f[c].capture, - f[f[c].parent].capture, - f[f[f[c].parent].parent].capture, ..., - until f[x].capture, with x such that - f[x].parent is the youngest common ancestor - for c and n. - - We go up along c's and n's ancestry until - we find x. 
- */ - } else { - p = c; - q = n; - while (p != q) { - if (p > q) { - cap = eng->f.at(p).capture; - if (cap >= 0) { - if (capBegin[cap] == i) { - capBegin[cap] = EmptyCapture; - capEnd[cap] = EmptyCapture; - } else { - capEnd[cap] = i; - } - } - p = eng->f.at(p).parent; - } else { - q = eng->f.at(q).parent; - } - } - } - - /* - In any case, we now open the capture zones - we are entering. We work upwards from n - until we reach p (the parent of the atom we - reenter or the youngest common ancestor). - */ - while (n > p) { - cap = eng->f.at(n).capture; - if (cap >= 0) { - capBegin[cap] = i; - capEnd[cap] = EmptyCapture; - } - n = eng->f.at(n).parent; - } - /* - If the next state was already in - nextStack, we must choose carefully which - capture zones we want to keep. - */ - if (capBegin == tempCapBegin && - isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap, - nextCapEnd + m * ncap)) { - memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int)); - memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int)); - } - } -#ifndef QT_NO_REGEXP_BACKREF - /* - We are done with updating the capture zones. - It's now time to put the next state to sleep, - if it needs to, and to remove it from - nextStack. - */ - if (needSomeSleep > 0) { - QList<int> zzZ(2 + 2 * ncap); - zzZ[0] = i + needSomeSleep; - zzZ[1] = next; - if (ncap > 0) { - memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int)); - memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int)); - } - inNextStack[nextStack[--nnext]] = -1; - sleeping.append(zzZ); - } -#endif -#endif - } - } - } -#ifndef QT_NO_REGEXP_CAPTURE - /* - If we reached the final state, hurray! Copy the captured - zone. - */ - if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) { - memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int)); - memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int)); - } -#ifndef QT_NO_REGEXP_BACKREF - /* - It's time to wake up the sleepers. 
- */ - j = 0; - while (j < sleeping.count()) { - if (sleeping.at(j)[0] == i) { - const QList<int> &zzZ = sleeping.at(j); - int next = zzZ[1]; - const int *capBegin = zzZ.data() + 2; - const int *capEnd = zzZ.data() + 2 + ncap; - bool copyOver = true; - - if ((m = inNextStack[next]) == -1) { - m = nnext++; - nextStack[m] = next; - inNextStack[next] = m; - } else { - copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap, - capBegin, capEnd); - } - if (copyOver) { - memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int)); - memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int)); - } - - sleeping.removeAt(j); - } else { - ++j; - } - } -#endif -#endif - for (j = 0; j < nnext; j++) - inNextStack[nextStack[j]] = -1; - - // avoid needless iteration that confuses oneTestMatchedLen - if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState -#ifndef QT_NO_REGEXP_BACKREF - && sleeping.isEmpty() -#endif - ) - stop = true; - - qSwap(curStack, nextStack); -#ifndef QT_NO_REGEXP_CAPTURE - qSwap(curCapBegin, nextCapBegin); - qSwap(curCapEnd, nextCapEnd); -#endif - ncur = nnext; - nnext = 0; - ++i; - } - -#ifndef QT_NO_REGEXP_BACKREF - /* - If minimal matching is enabled, we might have some sleepers - left. 
- */ - if (!sleeping.isEmpty()) - sleeping.clear(); -#endif - - oneTestMatchedLen = i - 1; - return (matchLen >= 0); -} - -#ifndef QT_NO_REGEXP_CCLASS - -QRegExpCharClass::QRegExpCharClass() - : c(0), n(false) -{ -#ifndef QT_NO_REGEXP_OPTIM - occ1.fill(NoOccurrence, NumBadChars); -#endif -} - -void QRegExpCharClass::clear() -{ - c = 0; - r.clear(); - n = false; -} - -void QRegExpCharClass::setNegative(bool negative) -{ - n = negative; -#ifndef QT_NO_REGEXP_OPTIM - occ1.fill(0, NumBadChars); -#endif -} - -void QRegExpCharClass::addCategories(uint cats) -{ - static const int all_cats = FLAG(QChar::Mark_NonSpacing) | - FLAG(QChar::Mark_SpacingCombining) | - FLAG(QChar::Mark_Enclosing) | - FLAG(QChar::Number_DecimalDigit) | - FLAG(QChar::Number_Letter) | - FLAG(QChar::Number_Other) | - FLAG(QChar::Separator_Space) | - FLAG(QChar::Separator_Line) | - FLAG(QChar::Separator_Paragraph) | - FLAG(QChar::Other_Control) | - FLAG(QChar::Other_Format) | - FLAG(QChar::Other_Surrogate) | - FLAG(QChar::Other_PrivateUse) | - FLAG(QChar::Other_NotAssigned) | - FLAG(QChar::Letter_Uppercase) | - FLAG(QChar::Letter_Lowercase) | - FLAG(QChar::Letter_Titlecase) | - FLAG(QChar::Letter_Modifier) | - FLAG(QChar::Letter_Other) | - FLAG(QChar::Punctuation_Connector) | - FLAG(QChar::Punctuation_Dash) | - FLAG(QChar::Punctuation_Open) | - FLAG(QChar::Punctuation_Close) | - FLAG(QChar::Punctuation_InitialQuote) | - FLAG(QChar::Punctuation_FinalQuote) | - FLAG(QChar::Punctuation_Other) | - FLAG(QChar::Symbol_Math) | - FLAG(QChar::Symbol_Currency) | - FLAG(QChar::Symbol_Modifier) | - FLAG(QChar::Symbol_Other); - c |= (all_cats & cats); -#ifndef QT_NO_REGEXP_OPTIM - occ1.fill(0, NumBadChars); -#endif -} - -void QRegExpCharClass::addRange(ushort from, ushort to) -{ - if (from > to) - qSwap(from, to); - int m = r.size(); - r.resize(m + 1); - r[m].from = from; - r[m].len = to - from + 1; - -#ifndef QT_NO_REGEXP_OPTIM - int i; - - if (to - from < NumBadChars) { - if (from % NumBadChars <= to % 
NumBadChars) { - for (i = from % NumBadChars; i <= to % NumBadChars; i++) - occ1[i] = 0; - } else { - for (i = 0; i <= to % NumBadChars; i++) - occ1[i] = 0; - for (i = from % NumBadChars; i < NumBadChars; i++) - occ1[i] = 0; - } - } else { - occ1.fill(0, NumBadChars); - } -#endif -} - -bool QRegExpCharClass::in(QChar ch) const -{ -#ifndef QT_NO_REGEXP_OPTIM - if (occ1.at(BadChar(ch)) == NoOccurrence) - return n; -#endif - - if (c != 0 && (c & FLAG(ch.category())) != 0) - return !n; - - const int uc = ch.unicode(); - int size = r.size(); - - for (int i = 0; i < size; ++i) { - const QRegExpCharClassRange &range = r.at(i); - if (uint(uc - range.from) < uint(r.at(i).len)) - return !n; - } - return n; -} - -#if defined(QT_DEBUG) -void QRegExpCharClass::dump() const -{ - int i; - qDebug(" %stive character class", n ? "nega" : "posi"); -#ifndef QT_NO_REGEXP_CCLASS - if (c != 0) - qDebug(" categories 0x%.8x", c); -#endif - for (i = 0; i < r.size(); i++) - qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1); -} -#endif -#endif - -QRegExpEngine::Box::Box(QRegExpEngine *engine) - : eng(engine), skipanchors(0) -#ifndef QT_NO_REGEXP_OPTIM - , earlyStart(0), lateStart(0), maxl(0) -#endif -{ -#ifndef QT_NO_REGEXP_OPTIM - occ1.fill(NoOccurrence, NumBadChars); -#endif - minl = 0; -} - -QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b) -{ - eng = b.eng; - ls = b.ls; - rs = b.rs; - lanchors = b.lanchors; - ranchors = b.ranchors; - skipanchors = b.skipanchors; -#ifndef QT_NO_REGEXP_OPTIM - earlyStart = b.earlyStart; - lateStart = b.lateStart; - str = b.str; - leftStr = b.leftStr; - rightStr = b.rightStr; - maxl = b.maxl; - occ1 = b.occ1; -#endif - minl = b.minl; - return *this; -} - -void QRegExpEngine::Box::set(QChar ch) -{ - ls.resize(1); - ls[0] = eng->createState(ch); - rs = ls; -#ifndef QT_NO_REGEXP_OPTIM - str = ch; - leftStr = ch; - rightStr = ch; - maxl = 1; - occ1[BadChar(ch)] = 0; -#endif - minl = 1; -} - -void QRegExpEngine::Box::set(const 
QRegExpCharClass &cc) -{ - ls.resize(1); - ls[0] = eng->createState(cc); - rs = ls; -#ifndef QT_NO_REGEXP_OPTIM - maxl = 1; - occ1 = cc.firstOccurrence(); -#endif - minl = 1; -} - -#ifndef QT_NO_REGEXP_BACKREF -void QRegExpEngine::Box::set(int bref) -{ - ls.resize(1); - ls[0] = eng->createState(bref); - rs = ls; - if (bref >= 1 && bref <= MaxBackRefs) - skipanchors = Anchor_BackRef0Empty << bref; -#ifndef QT_NO_REGEXP_OPTIM - maxl = InftyLen; -#endif - minl = 0; -} -#endif - -void QRegExpEngine::Box::cat(const Box &b) -{ - eng->addCatTransitions(rs, b.ls); - addAnchorsToEngine(b); - if (minl == 0) { - lanchors.insert(b.lanchors); - if (skipanchors != 0) { - for (int i = 0; i < b.ls.size(); i++) { - int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors); - lanchors.insert(b.ls.at(i), a); - } - } - mergeInto(&ls, b.ls); - } - if (b.minl == 0) { - ranchors.insert(b.ranchors); - if (b.skipanchors != 0) { - for (int i = 0; i < rs.size(); i++) { - int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors); - ranchors.insert(rs.at(i), a); - } - } - mergeInto(&rs, b.rs); - } else { - ranchors = b.ranchors; - rs = b.rs; - } - -#ifndef QT_NO_REGEXP_OPTIM - if (maxl != InftyLen) { - if (rightStr.length() + b.leftStr.length() > - qMax(str.length(), b.str.length())) { - earlyStart = minl - rightStr.length(); - lateStart = maxl - rightStr.length(); - str = rightStr + b.leftStr; - } else if (b.str.length() > str.length()) { - earlyStart = minl + b.earlyStart; - lateStart = maxl + b.lateStart; - str = b.str; - } - } - - if (leftStr.length() == maxl) - leftStr += b.leftStr; - - if (b.rightStr.length() == b.maxl) { - rightStr += b.rightStr; - } else { - rightStr = b.rightStr; - } - - if (maxl == InftyLen || b.maxl == InftyLen) { - maxl = InftyLen; - } else { - maxl += b.maxl; - } - - for (int i = 0; i < NumBadChars; i++) { - if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i)) - occ1[i] = minl + b.occ1.at(i); - } -#endif - - 
minl += b.minl; - if (minl == 0) - skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors); - else - skipanchors = 0; -} - -void QRegExpEngine::Box::orx(const Box &b) -{ - mergeInto(&ls, b.ls); - lanchors.insert(b.lanchors); - mergeInto(&rs, b.rs); - ranchors.insert(b.ranchors); - - if (b.minl == 0) { - if (minl == 0) - skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors); - else - skipanchors = b.skipanchors; - } - -#ifndef QT_NO_REGEXP_OPTIM - for (int i = 0; i < NumBadChars; i++) { - if (occ1.at(i) > b.occ1.at(i)) - occ1[i] = b.occ1.at(i); - } - earlyStart = 0; - lateStart = 0; - str = QString(); - leftStr = QString(); - rightStr = QString(); - if (b.maxl > maxl) - maxl = b.maxl; -#endif - if (b.minl < minl) - minl = b.minl; -} - -void QRegExpEngine::Box::plus(int atom) -{ -#ifndef QT_NO_REGEXP_CAPTURE - eng->addPlusTransitions(rs, ls, atom); -#else - Q_UNUSED(atom); - eng->addCatTransitions(rs, ls); -#endif - addAnchorsToEngine(*this); -#ifndef QT_NO_REGEXP_OPTIM - maxl = InftyLen; -#endif -} - -void QRegExpEngine::Box::opt() -{ -#ifndef QT_NO_REGEXP_OPTIM - earlyStart = 0; - lateStart = 0; - str = QString(); - leftStr = QString(); - rightStr = QString(); -#endif - skipanchors = 0; - minl = 0; -} - -void QRegExpEngine::Box::catAnchor(int a) -{ - if (a != 0) { - for (int i = 0; i < rs.size(); i++) { - a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a); - ranchors.insert(rs.at(i), a); - } - if (minl == 0) - skipanchors = eng->anchorConcatenation(skipanchors, a); - } -} - -#ifndef QT_NO_REGEXP_OPTIM -void QRegExpEngine::Box::setupHeuristics() -{ - eng->goodEarlyStart = earlyStart; - eng->goodLateStart = lateStart; - eng->goodStr = eng->cs ? str : str.toLower(); - - eng->minl = minl; - if (eng->cs) { - /* - A regular expression such as 112|1 has occ1['2'] = 2 and minl = - 1 at this point. An entry of occ1 has to be at most minl or - infinity for the rest of the algorithm to go well. 
- - We waited until here before normalizing these cases (instead of - doing it in Box::orx()) because sometimes things improve by - themselves. Consider for example (112|1)34. - */ - for (int i = 0; i < NumBadChars; i++) { - if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl) - occ1[i] = minl; - } - eng->occ1 = occ1; - } else { - eng->occ1.fill(0, NumBadChars); - } - - eng->heuristicallyChooseHeuristic(); -} -#endif - -#if defined(QT_DEBUG) -void QRegExpEngine::Box::dump() const -{ - int i; - qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s"); - qDebug(" Left states:"); - for (i = 0; i < ls.size(); i++) { - if (lanchors.value(ls[i], 0) == 0) - qDebug(" %d", ls[i]); - else - qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]); - } - qDebug(" Right states:"); - for (i = 0; i < rs.size(); i++) { - if (ranchors.value(rs[i], 0) == 0) - qDebug(" %d", rs[i]); - else - qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]); - } - qDebug(" Skip anchors: 0x%.8x", skipanchors); -} -#endif - -void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const -{ - for (int i = 0; i < to.ls.size(); i++) { - for (int j = 0; j < rs.size(); j++) { - int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0), - to.lanchors.value(to.ls.at(i), 0)); - eng->addAnchors(rs[j], to.ls[i], a); - } - } -} - -#ifndef QT_NO_REGEXP_CCLASS -// fast lookup hash for xml schema extensions -// sorted by name for b-search -static const struct CategoriesRangeMapEntry { - const char name[40]; - uint first, second; -} categoriesRangeMap[] = { - { "AegeanNumbers", 0x10100, 0x1013F }, - { "AlphabeticPresentationForms", 0xFB00, 0xFB4F }, - { "AncientGreekMusicalNotation", 0x1D200, 0x1D24F }, - { "AncientGreekNumbers", 0x10140, 0x1018F }, - { "Arabic", 0x0600, 0x06FF }, - { "ArabicPresentationForms-A", 0xFB50, 0xFDFF }, - { "ArabicPresentationForms-B", 0xFE70, 0xFEFF }, - { "ArabicSupplement", 0x0750, 0x077F }, - { "Armenian", 0x0530, 0x058F }, - { "Arrows", 0x2190, 0x21FF }, - { 
"BasicLatin", 0x0000, 0x007F }, - { "Bengali", 0x0980, 0x09FF }, - { "BlockElements", 0x2580, 0x259F }, - { "Bopomofo", 0x3100, 0x312F }, - { "BopomofoExtended", 0x31A0, 0x31BF }, - { "BoxDrawing", 0x2500, 0x257F }, - { "BraillePatterns", 0x2800, 0x28FF }, - { "Buginese", 0x1A00, 0x1A1F }, - { "Buhid", 0x1740, 0x175F }, - { "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF }, - { "CJKCompatibility", 0x3300, 0x33FF }, - { "CJKCompatibilityForms", 0xFE30, 0xFE4F }, - { "CJKCompatibilityIdeographs", 0xF900, 0xFAFF }, - { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F }, - { "CJKRadicalsSupplement", 0x2E80, 0x2EFF }, - { "CJKStrokes", 0x31C0, 0x31EF }, - { "CJKSymbolsandPunctuation", 0x3000, 0x303F }, - { "CJKUnifiedIdeographs", 0x4E00, 0x9FFF }, - { "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 }, - { "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF }, - { "Cherokee", 0x13A0, 0x13FF }, - { "CombiningDiacriticalMarks", 0x0300, 0x036F }, - { "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF }, - { "CombiningHalfMarks", 0xFE20, 0xFE2F }, - { "CombiningMarksforSymbols", 0x20D0, 0x20FF }, - { "ControlPictures", 0x2400, 0x243F }, - { "Coptic", 0x2C80, 0x2CFF }, - { "CurrencySymbols", 0x20A0, 0x20CF }, - { "CypriotSyllabary", 0x10800, 0x1083F }, - { "Cyrillic", 0x0400, 0x04FF }, - { "CyrillicSupplement", 0x0500, 0x052F }, - { "Deseret", 0x10400, 0x1044F }, - { "Devanagari", 0x0900, 0x097F }, - { "Dingbats", 0x2700, 0x27BF }, - { "EnclosedAlphanumerics", 0x2460, 0x24FF }, - { "EnclosedCJKLettersandMonths", 0x3200, 0x32FF }, - { "Ethiopic", 0x1200, 0x137F }, - { "EthiopicExtended", 0x2D80, 0x2DDF }, - { "EthiopicSupplement", 0x1380, 0x139F }, - { "GeneralPunctuation", 0x2000, 0x206F }, - { "GeometricShapes", 0x25A0, 0x25FF }, - { "Georgian", 0x10A0, 0x10FF }, - { "GeorgianSupplement", 0x2D00, 0x2D2F }, - { "Glagolitic", 0x2C00, 0x2C5F }, - { "Gothic", 0x10330, 0x1034F }, - { "Greek", 0x0370, 0x03FF }, - { "GreekExtended", 0x1F00, 0x1FFF }, - { "Gujarati", 0x0A80, 
0x0AFF }, - { "Gurmukhi", 0x0A00, 0x0A7F }, - { "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF }, - { "HangulCompatibilityJamo", 0x3130, 0x318F }, - { "HangulJamo", 0x1100, 0x11FF }, - { "HangulSyllables", 0xAC00, 0xD7A3 }, - { "Hanunoo", 0x1720, 0x173F }, - { "Hebrew", 0x0590, 0x05FF }, - { "Hiragana", 0x3040, 0x309F }, - { "IPAExtensions", 0x0250, 0x02AF }, - { "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF }, - { "Kanbun", 0x3190, 0x319F }, - { "KangxiRadicals", 0x2F00, 0x2FDF }, - { "Kannada", 0x0C80, 0x0CFF }, - { "Katakana", 0x30A0, 0x30FF }, - { "KatakanaPhoneticExtensions", 0x31F0, 0x31FF }, - { "Kharoshthi", 0x10A00, 0x10A5F }, - { "Khmer", 0x1780, 0x17FF }, - { "KhmerSymbols", 0x19E0, 0x19FF }, - { "Lao", 0x0E80, 0x0EFF }, - { "Latin-1Supplement", 0x0080, 0x00FF }, - { "LatinExtended-A", 0x0100, 0x017F }, - { "LatinExtended-B", 0x0180, 0x024F }, - { "LatinExtendedAdditional", 0x1E00, 0x1EFF }, - { "LetterlikeSymbols", 0x2100, 0x214F }, - { "Limbu", 0x1900, 0x194F }, - { "LinearBIdeograms", 0x10080, 0x100FF }, - { "LinearBSyllabary", 0x10000, 0x1007F }, - { "Malayalam", 0x0D00, 0x0D7F }, - { "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF }, - { "MathematicalOperators", 0x2200, 0x22FF }, - { "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF }, - { "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF }, - { "MiscellaneousSymbols", 0x2600, 0x26FF }, - { "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF }, - { "MiscellaneousTechnical", 0x2300, 0x23FF }, - { "ModifierToneLetters", 0xA700, 0xA71F }, - { "Mongolian", 0x1800, 0x18AF }, - { "MusicalSymbols", 0x1D100, 0x1D1FF }, - { "Myanmar", 0x1000, 0x109F }, - { "NewTaiLue", 0x1980, 0x19DF }, - { "NumberForms", 0x2150, 0x218F }, - { "Ogham", 0x1680, 0x169F }, - { "OldItalic", 0x10300, 0x1032F }, - { "OldPersian", 0x103A0, 0x103DF }, - { "OpticalCharacterRecognition", 0x2440, 0x245F }, - { "Oriya", 0x0B00, 0x0B7F }, - { "Osmanya", 0x10480, 0x104AF }, - { "PhoneticExtensions", 0x1D00, 0x1D7F }, - { 
"PhoneticExtensionsSupplement", 0x1D80, 0x1DBF }, - { "PrivateUse", 0xE000, 0xF8FF }, - { "Runic", 0x16A0, 0x16FF }, - { "Shavian", 0x10450, 0x1047F }, - { "Sinhala", 0x0D80, 0x0DFF }, - { "SmallFormVariants", 0xFE50, 0xFE6F }, - { "SpacingModifierLetters", 0x02B0, 0x02FF }, - { "Specials", 0xFFF0, 0xFFFF }, - { "SuperscriptsandSubscripts", 0x2070, 0x209F }, - { "SupplementalArrows-A", 0x27F0, 0x27FF }, - { "SupplementalArrows-B", 0x2900, 0x297F }, - { "SupplementalMathematicalOperators", 0x2A00, 0x2AFF }, - { "SupplementalPunctuation", 0x2E00, 0x2E7F }, - { "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF }, - { "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF }, - { "SylotiNagri", 0xA800, 0xA82F }, - { "Syriac", 0x0700, 0x074F }, - { "Tagalog", 0x1700, 0x171F }, - { "Tagbanwa", 0x1760, 0x177F }, - { "Tags", 0xE0000, 0xE007F }, - { "TaiLe", 0x1950, 0x197F }, - { "TaiXuanJingSymbols", 0x1D300, 0x1D35F }, - { "Tamil", 0x0B80, 0x0BFF }, - { "Telugu", 0x0C00, 0x0C7F }, - { "Thaana", 0x0780, 0x07BF }, - { "Thai", 0x0E00, 0x0E7F }, - { "Tibetan", 0x0F00, 0x0FFF }, - { "Tifinagh", 0x2D30, 0x2D7F }, - { "Ugaritic", 0x10380, 0x1039F }, - { "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F }, - { "VariationSelectors", 0xFE00, 0xFE0F }, - { "VariationSelectorsSupplement", 0xE0100, 0xE01EF }, - { "VerticalForms", 0xFE10, 0xFE1F }, - { "YiRadicals", 0xA490, 0xA4CF }, - { "YiSyllables", 0xA000, 0xA48F }, - { "YijingHexagramSymbols", 0x4DC0, 0x4DFF } -}; - -inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2) -{ return qstrcmp(entry1.name, entry2.name) < 0; } -inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry) -{ return qstrcmp(name, entry.name) < 0; } -inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name) -{ return qstrcmp(entry.name, name) < 0; } -#endif // QT_NO_REGEXP_CCLASS - -int QRegExpEngine::getChar() -{ - return (yyPos == yyLen) ? 
EOS : yyIn[yyPos++].unicode(); -} - -int QRegExpEngine::getEscape() -{ -#ifndef QT_NO_REGEXP_ESCAPE - const char tab[] = "afnrtv"; // no b, as \b means word boundary - const char backTab[] = "\a\f\n\r\t\v"; - ushort low; - int i; -#endif - ushort val; - int prevCh = yyCh; - - if (prevCh == EOS) { - error(RXERR_END); - return Tok_Char | '\\'; - } - yyCh = getChar(); -#ifndef QT_NO_REGEXP_ESCAPE - if ((prevCh & ~0xff) == 0) { - const char *p = strchr(tab, prevCh); - if (p != nullptr) - return Tok_Char | backTab[p - tab]; - } -#endif - - switch (prevCh) { -#ifndef QT_NO_REGEXP_ESCAPE - case '0': - val = 0; - for (i = 0; i < 3; i++) { - if (yyCh >= '0' && yyCh <= '7') - val = (val << 3) | (yyCh - '0'); - else - break; - yyCh = getChar(); - } - if ((val & ~0377) != 0) - error(RXERR_OCTAL); - return Tok_Char | val; -#endif -#ifndef QT_NO_REGEXP_ESCAPE - case 'B': - return Tok_NonWord; -#endif -#ifndef QT_NO_REGEXP_CCLASS - case 'D': - // see QChar::isDigit() - yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit)); - return Tok_CharClass; - case 'S': - // see QChar::isSpace() - yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) | - FLAG(QChar::Separator_Line) | - FLAG(QChar::Separator_Paragraph) | - FLAG(QChar::Other_Control))); - yyCharClass->addRange(0x0000, 0x0008); - yyCharClass->addRange(0x000e, 0x001f); - yyCharClass->addRange(0x007f, 0x0084); - yyCharClass->addRange(0x0086, 0x009f); - return Tok_CharClass; - case 'W': - // see QChar::isLetterOrNumber() and QChar::isMark() - yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) | - FLAG(QChar::Mark_SpacingCombining) | - FLAG(QChar::Mark_Enclosing) | - FLAG(QChar::Number_DecimalDigit) | - FLAG(QChar::Number_Letter) | - FLAG(QChar::Number_Other) | - FLAG(QChar::Letter_Uppercase) | - FLAG(QChar::Letter_Lowercase) | - FLAG(QChar::Letter_Titlecase) | - FLAG(QChar::Letter_Modifier) | - FLAG(QChar::Letter_Other) | - FLAG(QChar::Punctuation_Connector))); - 
yyCharClass->addRange(0x203f, 0x2040); - yyCharClass->addSingleton(0x2040); - yyCharClass->addSingleton(0x2054); - yyCharClass->addSingleton(0x30fb); - yyCharClass->addRange(0xfe33, 0xfe34); - yyCharClass->addRange(0xfe4d, 0xfe4f); - yyCharClass->addSingleton(0xff3f); - yyCharClass->addSingleton(0xff65); - return Tok_CharClass; -#endif -#ifndef QT_NO_REGEXP_ESCAPE - case 'b': - return Tok_Word; -#endif -#ifndef QT_NO_REGEXP_CCLASS - case 'd': - // see QChar::isDigit() - yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); - return Tok_CharClass; - case 's': - // see QChar::isSpace() - yyCharClass->addCategories(FLAG(QChar::Separator_Space) | - FLAG(QChar::Separator_Line) | - FLAG(QChar::Separator_Paragraph)); - yyCharClass->addRange(0x0009, 0x000d); - yyCharClass->addSingleton(0x0085); - return Tok_CharClass; - case 'w': - // see QChar::isLetterOrNumber() and QChar::isMark() - yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | - FLAG(QChar::Mark_SpacingCombining) | - FLAG(QChar::Mark_Enclosing) | - FLAG(QChar::Number_DecimalDigit) | - FLAG(QChar::Number_Letter) | - FLAG(QChar::Number_Other) | - FLAG(QChar::Letter_Uppercase) | - FLAG(QChar::Letter_Lowercase) | - FLAG(QChar::Letter_Titlecase) | - FLAG(QChar::Letter_Modifier) | - FLAG(QChar::Letter_Other)); - yyCharClass->addSingleton(0x005f); // '_' - return Tok_CharClass; - case 'I': - if (!xmlSchemaExtensions) - break; - yyCharClass->setNegative(!yyCharClass->negative()); - Q_FALLTHROUGH(); - case 'i': - if (xmlSchemaExtensions) { - yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | - FLAG(QChar::Mark_SpacingCombining) | - FLAG(QChar::Mark_Enclosing) | - FLAG(QChar::Number_DecimalDigit) | - FLAG(QChar::Number_Letter) | - FLAG(QChar::Number_Other) | - FLAG(QChar::Letter_Uppercase) | - FLAG(QChar::Letter_Lowercase) | - FLAG(QChar::Letter_Titlecase) | - FLAG(QChar::Letter_Modifier) | - FLAG(QChar::Letter_Other)); - yyCharClass->addSingleton(0x003a); // ':' - yyCharClass->addSingleton(0x005f); // 
'_' - yyCharClass->addRange(0x0041, 0x005a); // [A-Z] - yyCharClass->addRange(0x0061, 0x007a); // [a-z] - yyCharClass->addRange(0xc0, 0xd6); - yyCharClass->addRange(0xd8, 0xf6); - yyCharClass->addRange(0xf8, 0x2ff); - yyCharClass->addRange(0x370, 0x37d); - yyCharClass->addRange(0x37f, 0x1fff); - yyCharClass->addRange(0x200c, 0x200d); - yyCharClass->addRange(0x2070, 0x218f); - yyCharClass->addRange(0x2c00, 0x2fef); - yyCharClass->addRange(0x3001, 0xd7ff); - yyCharClass->addRange(0xf900, 0xfdcf); - yyCharClass->addRange(0xfdf0, 0xfffd); - yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff); - return Tok_CharClass; - } else { - break; - } - case 'C': - if (!xmlSchemaExtensions) - break; - yyCharClass->setNegative(!yyCharClass->negative()); - Q_FALLTHROUGH(); - case 'c': - if (xmlSchemaExtensions) { - yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | - FLAG(QChar::Mark_SpacingCombining) | - FLAG(QChar::Mark_Enclosing) | - FLAG(QChar::Number_DecimalDigit) | - FLAG(QChar::Number_Letter) | - FLAG(QChar::Number_Other) | - FLAG(QChar::Letter_Uppercase) | - FLAG(QChar::Letter_Lowercase) | - FLAG(QChar::Letter_Titlecase) | - FLAG(QChar::Letter_Modifier) | - FLAG(QChar::Letter_Other)); - yyCharClass->addSingleton(0x002d); // '-' - yyCharClass->addSingleton(0x002e); // '.' 
- yyCharClass->addSingleton(0x003a); // ':' - yyCharClass->addSingleton(0x005f); // '_' - yyCharClass->addSingleton(0xb7); - yyCharClass->addRange(0x0030, 0x0039); // [0-9] - yyCharClass->addRange(0x0041, 0x005a); // [A-Z] - yyCharClass->addRange(0x0061, 0x007a); // [a-z] - yyCharClass->addRange(0xc0, 0xd6); - yyCharClass->addRange(0xd8, 0xf6); - yyCharClass->addRange(0xf8, 0x2ff); - yyCharClass->addRange(0x370, 0x37d); - yyCharClass->addRange(0x37f, 0x1fff); - yyCharClass->addRange(0x200c, 0x200d); - yyCharClass->addRange(0x2070, 0x218f); - yyCharClass->addRange(0x2c00, 0x2fef); - yyCharClass->addRange(0x3001, 0xd7ff); - yyCharClass->addRange(0xf900, 0xfdcf); - yyCharClass->addRange(0xfdf0, 0xfffd); - yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff); - yyCharClass->addRange(0x0300, 0x036f); - yyCharClass->addRange(0x203f, 0x2040); - return Tok_CharClass; - } else { - break; - } - case 'P': - if (!xmlSchemaExtensions) - break; - yyCharClass->setNegative(!yyCharClass->negative()); - Q_FALLTHROUGH(); - case 'p': - if (xmlSchemaExtensions) { - if (yyCh != '{') { - error(RXERR_CHARCLASS); - return Tok_CharClass; - } - - QByteArray category; - yyCh = getChar(); - while (yyCh != '}') { - if (yyCh == EOS) { - error(RXERR_END); - return Tok_CharClass; - } - category.append(yyCh); - yyCh = getChar(); - } - yyCh = getChar(); // skip closing '}' - - int catlen = category.length(); - if (catlen == 1 || catlen == 2) { - switch (category.at(0)) { - case 'M': - if (catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | - FLAG(QChar::Mark_SpacingCombining) | - FLAG(QChar::Mark_Enclosing)); - } else { - switch (category.at(1)) { - case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn - case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc - case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me - default: error(RXERR_CATEGORY); break; - } - } - break; - case 'N': - if 
(catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) | - FLAG(QChar::Number_Letter) | - FLAG(QChar::Number_Other)); - } else { - switch (category.at(1)) { - case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd - case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl - case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No - default: error(RXERR_CATEGORY); break; - } - } - break; - case 'Z': - if (catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Separator_Space) | - FLAG(QChar::Separator_Line) | - FLAG(QChar::Separator_Paragraph)); - } else { - switch (category.at(1)) { - case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs - case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl - case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp - default: error(RXERR_CATEGORY); break; - } - } - break; - case 'C': - if (catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Other_Control) | - FLAG(QChar::Other_Format) | - FLAG(QChar::Other_Surrogate) | - FLAG(QChar::Other_PrivateUse) | - FLAG(QChar::Other_NotAssigned)); - } else { - switch (category.at(1)) { - case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc - case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf - case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs - case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co - case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn - default: error(RXERR_CATEGORY); break; - } - } - break; - case 'L': - if (catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) | - FLAG(QChar::Letter_Lowercase) | - FLAG(QChar::Letter_Titlecase) | - FLAG(QChar::Letter_Modifier) | - FLAG(QChar::Letter_Other)); - } else { - switch (category.at(1)) { - 
case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu - case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll - case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt - case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm - case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo - default: error(RXERR_CATEGORY); break; - } - } - break; - case 'P': - if (catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) | - FLAG(QChar::Punctuation_Dash) | - FLAG(QChar::Punctuation_Open) | - FLAG(QChar::Punctuation_Close) | - FLAG(QChar::Punctuation_InitialQuote) | - FLAG(QChar::Punctuation_FinalQuote) | - FLAG(QChar::Punctuation_Other)); - } else { - switch (category.at(1)) { - case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc - case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd - case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps - case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe - case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi - case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf - case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po - default: error(RXERR_CATEGORY); break; - } - } - break; - case 'S': - if (catlen == 1) { - yyCharClass->addCategories(FLAG(QChar::Symbol_Math) | - FLAG(QChar::Symbol_Currency) | - FLAG(QChar::Symbol_Modifier) | - FLAG(QChar::Symbol_Other)); - } else { - switch (category.at(1)) { - case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm - case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc - case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk - case 'o': 
yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So - default: error(RXERR_CATEGORY); break; - } - } - break; - default: - error(RXERR_CATEGORY); - break; - } - } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') { - static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]); - const char * const categoryFamily = category.constData() + 2; - const CategoriesRangeMapEntry *r = std::lower_bound(categoriesRangeMap, categoriesRangeMap + N, categoryFamily); - if (r != categoriesRangeMap + N && qstrcmp(r->name, categoryFamily) == 0) - yyCharClass->addRange(r->first, r->second); - else - error(RXERR_CATEGORY); - } else { - error(RXERR_CATEGORY); - } - return Tok_CharClass; - } else { - break; - } -#endif -#ifndef QT_NO_REGEXP_ESCAPE - case 'x': - val = 0; - for (i = 0; i < 4; i++) { - low = QChar(yyCh).toLower().unicode(); - if (low >= '0' && low <= '9') - val = (val << 4) | (low - '0'); - else if (low >= 'a' && low <= 'f') - val = (val << 4) | (low - 'a' + 10); - else - break; - yyCh = getChar(); - } - return Tok_Char | val; -#endif - default: - break; - } - if (prevCh >= '1' && prevCh <= '9') { -#ifndef QT_NO_REGEXP_BACKREF - val = prevCh - '0'; - while (yyCh >= '0' && yyCh <= '9') { - val = (val * 10) + (yyCh - '0'); - yyCh = getChar(); - } - return Tok_BackRef | val; -#else - error(RXERR_DISABLED); -#endif - } - return Tok_Char | prevCh; -} - -#ifndef QT_NO_REGEXP_INTERVAL -int QRegExpEngine::getRep(int def) -{ - if (yyCh >= '0' && yyCh <= '9') { - int rep = 0; - do { - rep = 10 * rep + yyCh - '0'; - if (rep >= InftyRep) { - error(RXERR_REPETITION); - rep = def; - } - yyCh = getChar(); - } while (yyCh >= '0' && yyCh <= '9'); - return rep; - } else { - return def; - } -} -#endif - -#ifndef QT_NO_REGEXP_LOOKAHEAD -void QRegExpEngine::skipChars(int n) -{ - if (n > 0) { - yyPos += n - 1; - yyCh = getChar(); - } -} -#endif - -void QRegExpEngine::error(const char *msg) -{ - if (yyError.isEmpty()) - yyError = 
QLatin1String(msg); -} - -void QRegExpEngine::startTokenizer(const QChar *rx, int len) -{ - yyIn = rx; - yyPos0 = 0; - yyPos = 0; - yyLen = len; - yyCh = getChar(); - yyCharClass.reset(new QRegExpCharClass); - yyMinRep = 0; - yyMaxRep = 0; - yyError = QString(); -} - -int QRegExpEngine::getToken() -{ -#ifndef QT_NO_REGEXP_CCLASS - ushort pendingCh = 0; - bool charPending; - bool rangePending; - int tok; -#endif - int prevCh = yyCh; - - yyPos0 = yyPos - 1; -#ifndef QT_NO_REGEXP_CCLASS - yyCharClass->clear(); -#endif - yyMinRep = 0; - yyMaxRep = 0; - yyCh = getChar(); - - switch (prevCh) { - case EOS: - yyPos0 = yyPos; - return Tok_Eos; - case '$': - return Tok_Dollar; - case '(': - if (yyCh == '?') { - prevCh = getChar(); - yyCh = getChar(); - switch (prevCh) { -#ifndef QT_NO_REGEXP_LOOKAHEAD - case '!': - return Tok_NegLookahead; - case '=': - return Tok_PosLookahead; -#endif - case ':': - return Tok_MagicLeftParen; - case '<': - error(RXERR_LOOKBEHIND); - return Tok_MagicLeftParen; - default: - error(RXERR_LOOKAHEAD); - return Tok_MagicLeftParen; - } - } else { - return Tok_LeftParen; - } - case ')': - return Tok_RightParen; - case '*': - yyMinRep = 0; - yyMaxRep = InftyRep; - return Tok_Quantifier; - case '+': - yyMinRep = 1; - yyMaxRep = InftyRep; - return Tok_Quantifier; - case '.': -#ifndef QT_NO_REGEXP_CCLASS - yyCharClass->setNegative(true); -#endif - return Tok_CharClass; - case '?': - yyMinRep = 0; - yyMaxRep = 1; - return Tok_Quantifier; - case '[': -#ifndef QT_NO_REGEXP_CCLASS - if (yyCh == '^') { - yyCharClass->setNegative(true); - yyCh = getChar(); - } - charPending = false; - rangePending = false; - do { - if (yyCh == '-' && charPending && !rangePending) { - rangePending = true; - yyCh = getChar(); - } else { - if (charPending && !rangePending) { - yyCharClass->addSingleton(pendingCh); - charPending = false; - } - if (yyCh == '\\') { - yyCh = getChar(); - tok = getEscape(); - if (tok == Tok_Word) - tok = '\b'; - } else { - tok = Tok_Char | yyCh; - 
yyCh = getChar(); - } - if (tok == Tok_CharClass) { - if (rangePending) { - yyCharClass->addSingleton('-'); - yyCharClass->addSingleton(pendingCh); - charPending = false; - rangePending = false; - } - } else if ((tok & Tok_Char) != 0) { - if (rangePending) { - yyCharClass->addRange(pendingCh, tok ^ Tok_Char); - charPending = false; - rangePending = false; - } else { - pendingCh = tok ^ Tok_Char; - charPending = true; - } - } else { - error(RXERR_CHARCLASS); - } - } - } while (yyCh != ']' && yyCh != EOS); - if (rangePending) - yyCharClass->addSingleton('-'); - if (charPending) - yyCharClass->addSingleton(pendingCh); - if (yyCh == EOS) - error(RXERR_END); - else - yyCh = getChar(); - return Tok_CharClass; -#else - error(RXERR_END); - return Tok_Char | '['; -#endif - case '\\': - return getEscape(); - case ']': - error(RXERR_LEFTDELIM); - return Tok_Char | ']'; - case '^': - return Tok_Caret; - case '{': -#ifndef QT_NO_REGEXP_INTERVAL - yyMinRep = getRep(0); - yyMaxRep = yyMinRep; - if (yyCh == ',') { - yyCh = getChar(); - yyMaxRep = getRep(InftyRep); - } - if (yyMaxRep < yyMinRep) - error(RXERR_INTERVAL); - if (yyCh != '}') - error(RXERR_REPETITION); - yyCh = getChar(); - return Tok_Quantifier; -#else - error(RXERR_DISABLED); - return Tok_Char | '{'; -#endif - case '|': - return Tok_Bar; - case '}': - error(RXERR_LEFTDELIM); - return Tok_Char | '}'; - default: - return Tok_Char | prevCh; - } -} - -int QRegExpEngine::parse(const QChar *pattern, int len) -{ - valid = true; - startTokenizer(pattern, len); - yyTok = getToken(); -#ifndef QT_NO_REGEXP_CAPTURE - yyMayCapture = true; -#else - yyMayCapture = false; -#endif - -#ifndef QT_NO_REGEXP_CAPTURE - int atom = startAtom(false); -#endif - QRegExpCharClass anything; - Box box(this); // create InitialState - box.set(anything); - Box rightBox(this); // create FinalState - rightBox.set(anything); - - Box middleBox(this); - parseExpression(&middleBox); -#ifndef QT_NO_REGEXP_CAPTURE - finishAtom(atom, false); -#endif -#ifndef 
QT_NO_REGEXP_OPTIM - middleBox.setupHeuristics(); -#endif - box.cat(middleBox); - box.cat(rightBox); - yyCharClass.reset(); - -#ifndef QT_NO_REGEXP_CAPTURE - for (int i = 0; i < nf; ++i) { - switch (f[i].capture) { - case QRegExpAtom::NoCapture: - break; - case QRegExpAtom::OfficialCapture: - f[i].capture = ncap; - captureForOfficialCapture.append(ncap); - ++ncap; - ++officialncap; - break; - case QRegExpAtom::UnofficialCapture: - f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture; - } - } - -#ifndef QT_NO_REGEXP_BACKREF -#ifndef QT_NO_REGEXP_OPTIM - if (officialncap == 0 && nbrefs == 0) { - ncap = nf = 0; - f.clear(); - } -#endif - // handle the case where there's a \5 with no corresponding capture - // (captureForOfficialCapture.size() != officialncap) - for (int i = 0; i < nbrefs - officialncap; ++i) { - captureForOfficialCapture.append(ncap); - ++ncap; - } -#endif -#endif - - if (!yyError.isEmpty()) - return -1; - -#ifndef QT_NO_REGEXP_OPTIM - const QRegExpAutomatonState &sinit = s.at(InitialState); - caretAnchored = !sinit.anchors.isEmpty(); - if (caretAnchored) { - const QMap<int, int> &anchors = sinit.anchors; - QMap<int, int>::const_iterator a; - for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) { - if ( -#ifndef QT_NO_REGEXP_ANCHOR_ALT - (*a & Anchor_Alternation) != 0 || -#endif - (*a & Anchor_Caret) == 0) - { - caretAnchored = false; - break; - } - } - } -#endif - - // cleanup anchors - int numStates = s.count(); - for (int i = 0; i < numStates; ++i) { - QRegExpAutomatonState &state = s[i]; - if (!state.anchors.isEmpty()) { - QMap<int, int>::iterator a = state.anchors.begin(); - while (a != state.anchors.end()) { - if (a.value() == 0) - a = state.anchors.erase(a); - else - ++a; - } - } - } - - return yyPos0; -} - -void QRegExpEngine::parseAtom(Box *box) -{ -#ifndef QT_NO_REGEXP_LOOKAHEAD - QRegExpEngine *eng = nullptr; - bool neg; - int len; -#endif - - if ((yyTok & Tok_Char) != 0) { - box->set(QChar(yyTok ^ Tok_Char)); - } else 
{ -#ifndef QT_NO_REGEXP_OPTIM - trivial = false; -#endif - switch (yyTok) { - case Tok_Dollar: - box->catAnchor(Anchor_Dollar); - break; - case Tok_Caret: - box->catAnchor(Anchor_Caret); - break; -#ifndef QT_NO_REGEXP_LOOKAHEAD - case Tok_PosLookahead: - case Tok_NegLookahead: - neg = (yyTok == Tok_NegLookahead); - eng = new QRegExpEngine(cs, greedyQuantifiers); - len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1); - if (len >= 0) - skipChars(len); - else - error(RXERR_LOOKAHEAD); - box->catAnchor(addLookahead(eng, neg)); - yyTok = getToken(); - if (yyTok != Tok_RightParen) - error(RXERR_LOOKAHEAD); - break; -#endif -#ifndef QT_NO_REGEXP_ESCAPE - case Tok_Word: - box->catAnchor(Anchor_Word); - break; - case Tok_NonWord: - box->catAnchor(Anchor_NonWord); - break; -#endif - case Tok_LeftParen: - case Tok_MagicLeftParen: - yyTok = getToken(); - parseExpression(box); - if (yyTok != Tok_RightParen) - error(RXERR_END); - break; - case Tok_CharClass: - box->set(*yyCharClass); - break; - case Tok_Quantifier: - error(RXERR_REPETITION); - break; - default: -#ifndef QT_NO_REGEXP_BACKREF - if ((yyTok & Tok_BackRef) != 0) - box->set(yyTok ^ Tok_BackRef); - else -#endif - error(RXERR_DISABLED); - } - } - yyTok = getToken(); -} - -void QRegExpEngine::parseFactor(Box *box) -{ -#ifndef QT_NO_REGEXP_CAPTURE - int outerAtom = greedyQuantifiers ? 
startAtom(false) : -1; - int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen); - bool magicLeftParen = (yyTok == Tok_MagicLeftParen); -#else - const int innerAtom = -1; -#endif - -#ifndef QT_NO_REGEXP_INTERVAL -#define YYREDO() \ - yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ - *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok - - const QChar *in = yyIn; - int pos0 = yyPos0; - int pos = yyPos; - int len = yyLen; - int ch = yyCh; - QRegExpCharClass charClass; - if (yyTok == Tok_CharClass) - charClass = *yyCharClass; - int tok = yyTok; - bool mayCapture = yyMayCapture; -#endif - - parseAtom(box); -#ifndef QT_NO_REGEXP_CAPTURE - finishAtom(innerAtom, magicLeftParen); -#endif - - bool hasQuantifier = (yyTok == Tok_Quantifier); - if (hasQuantifier) { -#ifndef QT_NO_REGEXP_OPTIM - trivial = false; -#endif - if (yyMaxRep == InftyRep) { - box->plus(innerAtom); -#ifndef QT_NO_REGEXP_INTERVAL - } else if (yyMaxRep == 0) { - box->clear(); -#endif - } - if (yyMinRep == 0) - box->opt(); - -#ifndef QT_NO_REGEXP_INTERVAL - yyMayCapture = false; - int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1; - int beta = (yyMaxRep == InftyRep) ? 
0 : yyMaxRep - (alpha + 1); - - Box rightBox(this); - int i; - - for (i = 0; i < beta; i++) { - YYREDO(); - Box leftBox(this); - parseAtom(&leftBox); - leftBox.cat(rightBox); - leftBox.opt(); - rightBox = leftBox; - } - for (i = 0; i < alpha; i++) { - YYREDO(); - Box leftBox(this); - parseAtom(&leftBox); - leftBox.cat(rightBox); - rightBox = leftBox; - } - rightBox.cat(*box); - *box = rightBox; -#endif - yyTok = getToken(); -#ifndef QT_NO_REGEXP_INTERVAL - yyMayCapture = mayCapture; -#endif - } -#undef YYREDO -#ifndef QT_NO_REGEXP_CAPTURE - if (greedyQuantifiers) - finishAtom(outerAtom, hasQuantifier); -#endif -} - -void QRegExpEngine::parseTerm(Box *box) -{ -#ifndef QT_NO_REGEXP_OPTIM - if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) - parseFactor(box); -#endif - while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) { - Box rightBox(this); - parseFactor(&rightBox); - box->cat(rightBox); - } -} - -void QRegExpEngine::parseExpression(Box *box) -{ - parseTerm(box); - while (yyTok == Tok_Bar) { -#ifndef QT_NO_REGEXP_OPTIM - trivial = false; -#endif - Box rightBox(this); - yyTok = getToken(); - parseTerm(&rightBox); - box->orx(rightBox); - } -} - -/* - The struct QRegExpPrivate contains the private data of a regular - expression other than the automaton. It makes it possible for many - QRegExp objects to use the same QRegExpEngine object with different - QRegExpPrivate objects. 
-*/ -struct QRegExpPrivate -{ - QRegExpEngine *eng; - QRegExpEngineKey engineKey; - bool minimal; -#ifndef QT_NO_REGEXP_CAPTURE - QString t; // last string passed to QRegExp::indexIn() or lastIndexIn() - QStringList capturedCache; // what QRegExp::capturedTexts() returned last -#endif - QRegExpMatchState matchState; - - inline QRegExpPrivate() - : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { } - inline QRegExpPrivate(const QRegExpEngineKey &key) - : eng(nullptr), engineKey(key), minimal(false) {} -}; - -#if !defined(QT_NO_REGEXP_OPTIM) -struct QRECache -{ - typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache; - typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache; - EngineCache usedEngines; - UnusedEngineCache unusedEngines; -}; -Q_GLOBAL_STATIC(QRECache, engineCache) -static QBasicMutex engineCacheMutex; -#endif // QT_NO_REGEXP_OPTIM - -static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key) -{ -#if !defined(QT_NO_REGEXP_OPTIM) - const auto locker = qt_scoped_lock(engineCacheMutex); - if (!eng->ref.deref()) { - if (QRECache *c = engineCache()) { - c->unusedEngines.insert(key, eng, 4 + key.pattern.length() / 4); - c->usedEngines.remove(key); - } else { - delete eng; - } - } -#else - Q_UNUSED(key); - if (!eng->ref.deref()) - delete eng; -#endif -} - -static void prepareEngine_helper(QRegExpPrivate *priv) -{ - Q_ASSERT(!priv->eng); - -#if !defined(QT_NO_REGEXP_OPTIM) - const auto locker = qt_scoped_lock(engineCacheMutex); - if (QRECache *c = engineCache()) { - priv->eng = c->unusedEngines.take(priv->engineKey); - if (!priv->eng) - priv->eng = c->usedEngines.value(priv->engineKey); - if (!priv->eng) - priv->eng = new QRegExpEngine(priv->engineKey); - else - priv->eng->ref.ref(); - - c->usedEngines.insert(priv->engineKey, priv->eng); - return; - } -#endif // QT_NO_REGEXP_OPTIM - - priv->eng = new QRegExpEngine(priv->engineKey); -} - -inline static void prepareEngine(QRegExpPrivate *priv) 
-{ - if (priv->eng) - return; - prepareEngine_helper(priv); - priv->matchState.prepareForMatch(priv->eng); -} - -static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str) -{ - prepareEngine(priv); - priv->matchState.prepareForMatch(priv->eng); -#ifndef QT_NO_REGEXP_CAPTURE - priv->t = str; - priv->capturedCache.clear(); -#else - Q_UNUSED(str); -#endif -} - -static void invalidateEngine(QRegExpPrivate *priv) -{ - if (priv->eng) { - derefEngine(priv->eng, priv->engineKey); - priv->eng = nullptr; - priv->matchState.drain(); - } -} - -/*! - \enum QRegExp::CaretMode - - The CaretMode enum defines the different meanings of the caret - (\b{^}) in a regular expression. The possible values are: - - \value CaretAtZero - The caret corresponds to index 0 in the searched string. - - \value CaretAtOffset - The caret corresponds to the start offset of the search. - - \value CaretWontMatch - The caret never matches. -*/ - -/*! - \enum QRegExp::PatternSyntax - - The syntax used to interpret the meaning of the pattern. - - \value RegExp A rich Perl-like pattern matching syntax. This is - the default. - - \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. - (Introduced in Qt 4.2.) - - \value Wildcard This provides a simple pattern matching syntax - similar to that used by shells (command interpreters) for "file - globbing". See \l{QRegExp wildcard matching}. - - \value WildcardUnix This is similar to Wildcard but with the - behavior of a Unix shell. The wildcard characters can be escaped - with the character "\\". - - \value FixedString The pattern is a fixed string. This is - equivalent to using the RegExp pattern on a string in - which all metacharacters are escaped using escape(). - - \value W3CXmlSchema11 The pattern is a regular expression as - defined by the W3C XML Schema 1.1 specification. - - \sa setPatternSyntax() -*/ - -/*! - Constructs an empty regexp. 
- - \sa isValid(), errorString() -*/ -QRegExp::QRegExp() -{ - priv = new QRegExpPrivate; - prepareEngine(priv); -} - -/*! - Constructs a regular expression object for the given \a pattern - string. The pattern must be given using wildcard notation if \a - syntax is \l Wildcard; the default is \l RegExp. The pattern is - case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is - greedy (maximal), but can be changed by calling - setMinimal(). - - \sa setPattern(), setCaseSensitivity(), setPatternSyntax() -*/ -QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax) -{ - priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)); - prepareEngine(priv); -} - -/*! - Constructs a regular expression as a copy of \a rx. - - \sa operator=() -*/ -QRegExp::QRegExp(const QRegExp &rx) -{ - priv = new QRegExpPrivate; - operator=(rx); -} - -/*! - Destroys the regular expression and cleans up its internal data. -*/ -QRegExp::~QRegExp() -{ - invalidateEngine(priv); - delete priv; -} - -/*! - Copies the regular expression \a rx and returns a reference to the - copy. The case sensitivity, wildcard, and minimal matching options - are also copied. -*/ -QRegExp &QRegExp::operator=(const QRegExp &rx) -{ - prepareEngine(rx.priv); // to allow sharing - QRegExpEngine *otherEng = rx.priv->eng; - if (otherEng) - otherEng->ref.ref(); - invalidateEngine(priv); - priv->eng = otherEng; - priv->engineKey = rx.priv->engineKey; - priv->minimal = rx.priv->minimal; -#ifndef QT_NO_REGEXP_CAPTURE - priv->t = rx.priv->t; - priv->capturedCache = rx.priv->capturedCache; -#endif - if (priv->eng) - priv->matchState.prepareForMatch(priv->eng); - priv->matchState.captured = rx.priv->matchState.captured; - return *this; -} - -/*! - \fn QRegExp &QRegExp::operator=(QRegExp &&other) - - Move-assigns \a other to this QRegExp instance. - - \since 5.2 -*/ - -/*! 
- \fn void QRegExp::swap(QRegExp &other) - \since 4.8 - - Swaps regular expression \a other with this regular - expression. This operation is very fast and never fails. -*/ - -/*! - Returns \c true if this regular expression is equal to \a rx; - otherwise returns \c false. - - Two QRegExp objects are equal if they have the same pattern - strings and the same settings for case sensitivity, wildcard and - minimal matching. -*/ -bool QRegExp::operator==(const QRegExp &rx) const -{ - return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal; -} - -/*! - \since 5.6 - \relates QRegExp - - Returns the hash value for \a key, using - \a seed to seed the calculation. -*/ -size_t qHash(const QRegExp &key, size_t seed) noexcept -{ - QtPrivate::QHashCombine hash; - seed = hash(seed, key.priv->engineKey); - seed = hash(seed, key.priv->minimal); - return seed; -} - -/*! - \fn bool QRegExp::operator!=(const QRegExp &rx) const - - Returns \c true if this regular expression is not equal to \a rx; - otherwise returns \c false. - - \sa operator==() -*/ - -/*! - Returns \c true if the pattern string is empty; otherwise returns - false. - - If you call exactMatch() with an empty pattern on an empty string - it will return true; otherwise it returns \c false since it operates - over the whole string. If you call indexIn() with an empty pattern - on \e any string it will return the start offset (0 by default) - because the empty pattern matches the 'emptiness' at the start of - the string. In this case the length of the match returned by - matchedLength() will be 0. - - See QString::isEmpty(). -*/ - -bool QRegExp::isEmpty() const -{ - return priv->engineKey.pattern.isEmpty(); -} - -/*! - Returns \c true if the regular expression is valid; otherwise returns - false. An invalid regular expression never matches. - - The pattern \b{[a-z} is an example of an invalid pattern, since - it lacks a closing square bracket. 
- - Note that the validity of a regexp may also depend on the setting - of the wildcard flag, for example \b{*.html} is a valid - wildcard regexp but an invalid full regexp. - - \sa errorString() -*/ -bool QRegExp::isValid() const -{ - if (priv->engineKey.pattern.isEmpty()) { - return true; - } else { - prepareEngine(priv); - return priv->eng->isValid(); - } -} - -/*! - Returns the pattern string of the regular expression. The pattern - has either regular expression syntax or wildcard syntax, depending - on patternSyntax(). - - \sa patternSyntax(), caseSensitivity() -*/ -QString QRegExp::pattern() const -{ - return priv->engineKey.pattern; -} - -/*! - Sets the pattern string to \a pattern. The case sensitivity, - wildcard, and minimal matching options are not changed. - - \sa setPatternSyntax(), setCaseSensitivity() -*/ -void QRegExp::setPattern(const QString &pattern) -{ - if (priv->engineKey.pattern != pattern) { - invalidateEngine(priv); - priv->engineKey.pattern = pattern; - } -} - -/*! - Returns Qt::CaseSensitive if the regexp is matched case - sensitively; otherwise returns Qt::CaseInsensitive. - - \sa patternSyntax(), pattern(), isMinimal() -*/ -Qt::CaseSensitivity QRegExp::caseSensitivity() const -{ - return priv->engineKey.cs; -} - -/*! - Sets case sensitive matching to \a cs. - - If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches - \c{readme.txt} but not \c{README.TXT}. - - \sa setPatternSyntax(), setPattern(), setMinimal() -*/ -void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs) -{ - if ((bool)cs != (bool)priv->engineKey.cs) { - invalidateEngine(priv); - priv->engineKey.cs = cs; - } -} - -/*! - Returns the syntax used by the regular expression. The default is - QRegExp::RegExp. - - \sa pattern(), caseSensitivity() -*/ -QRegExp::PatternSyntax QRegExp::patternSyntax() const -{ - return priv->engineKey.patternSyntax; -} - -/*! - Sets the syntax mode for the regular expression. The default is - QRegExp::RegExp. 
- - Setting \a syntax to QRegExp::Wildcard enables simple shell-like - \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the - string \c{readme.txt} in wildcard mode, but does not match - \c{readme}. - - Setting \a syntax to QRegExp::FixedString means that the pattern - is interpreted as a plain string. Special characters (e.g., - backslash) don't need to be escaped then. - - \sa setPattern(), setCaseSensitivity(), escape() -*/ -void QRegExp::setPatternSyntax(PatternSyntax syntax) -{ - if (syntax != priv->engineKey.patternSyntax) { - invalidateEngine(priv); - priv->engineKey.patternSyntax = syntax; - } -} - -/*! - Returns \c true if minimal (non-greedy) matching is enabled; - otherwise returns \c false. - - \sa caseSensitivity(), setMinimal() -*/ -bool QRegExp::isMinimal() const -{ - return priv->minimal; -} - -/*! - Enables or disables minimal matching. If \a minimal is false, - matching is greedy (maximal) which is the default. - - For example, suppose we have the input string "We must be - <b>bold</b>, very <b>bold</b>!" and the pattern - \b{<b>.*</b>}. With the default greedy (maximal) matching, - the match is "We must be \underline{<b>bold</b>, very - <b>bold</b>}!". But with minimal (non-greedy) matching, the - first match is: "We must be \underline{<b>bold</b>}, very - <b>bold</b>!" and the second match is "We must be <b>bold</b>, - very \underline{<b>bold</b>}!". In practice we might use the pattern - \b{<b>[^<]*\</b>} instead, although this will still fail for - nested tags. - - \sa setCaseSensitivity() -*/ -void QRegExp::setMinimal(bool minimal) -{ - priv->minimal = minimal; -} - -// ### Qt 5: make non-const -/*! - Returns \c true if \a str is matched exactly by this regular - expression; otherwise returns \c false. You can determine how much of - the string was matched by calling matchedLength(). 
- - For a given regexp string R, exactMatch("R") is the equivalent of - indexIn("^R$") since exactMatch() effectively encloses the regexp - in the start of string and end of string anchors, except that it - sets matchedLength() differently. - - For example, if the regular expression is \b{blue}, then - exactMatch() returns \c true only for input \c blue. For inputs \c - bluebell, \c blutak and \c lightblue, exactMatch() returns \c false - and matchedLength() will return 4, 3 and 0 respectively. - - Although const, this function sets matchedLength(), - capturedTexts(), and pos(). - - \sa indexIn(), lastIndexIn() -*/ -bool QRegExp::exactMatch(const QString &str) const -{ - prepareEngineForMatch(priv, str); - priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0); - if (priv->matchState.captured[1] == str.length()) { - return true; - } else { - priv->matchState.captured[0] = 0; - priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen; - return false; - } -} - -/*! - Returns the regexp as a QVariant -*/ -QRegExp::operator QVariant() const -{ -QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED - QVariant v; - v.setValue(*this); - return v; -QT_WARNING_POP -} - -// ### Qt 5: make non-const -/*! - Attempts to find a match in \a str from position \a offset (0 by - default). If \a offset is -1, the search starts at the last - character; if -2, at the next to last character; etc. - - Returns the position of the first match, or -1 if there was no - match. - - The \a caretMode parameter can be used to instruct whether \b{^} - should match at index 0 or at \a offset. - - You might prefer to use QString::indexOf(), QString::contains(), - or even QStringList::filter(). To replace matches use - QString::replace(). - - Example: - \snippet code/src_corelib_text_qregexp.cpp 13 - - Although const, this function sets matchedLength(), - capturedTexts() and pos(). 
- - If the QRegExp is a wildcard expression (see setPatternSyntax()) - and want to test a string against the whole wildcard expression, - use exactMatch() instead of this function. - - \sa lastIndexIn(), exactMatch() -*/ - -int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const -{ - prepareEngineForMatch(priv, str); - if (offset < 0) - offset += str.length(); - priv->matchState.match(str.unicode(), str.length(), offset, - priv->minimal, false, caretIndex(offset, caretMode)); - return priv->matchState.captured[0]; -} - -// ### Qt 5: make non-const -/*! - Attempts to find a match backwards in \a str from position \a - offset. If \a offset is -1 (the default), the search starts at the - last character; if -2, at the next to last character; etc. - - Returns the position of the first match, or -1 if there was no - match. - - The \a caretMode parameter can be used to instruct whether \b{^} - should match at index 0 or at \a offset. - - Although const, this function sets matchedLength(), - capturedTexts() and pos(). - - \warning Searching backwards is much slower than searching - forwards. - - \sa indexIn(), exactMatch() -*/ - -int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const -{ - prepareEngineForMatch(priv, str); - if (offset < 0) - offset += str.length(); - if (offset < 0 || offset > str.length()) { - memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int)); - return -1; - } - - while (offset >= 0) { - priv->matchState.match(str.unicode(), str.length(), offset, - priv->minimal, true, caretIndex(offset, caretMode)); - if (priv->matchState.captured[0] == offset) - return offset; - --offset; - } - return -1; -} - -/*! - Returns the length of the last matched string, or -1 if there was - no match. - - \sa exactMatch(), indexIn(), lastIndexIn() -*/ -int QRegExp::matchedLength() const -{ - return priv->matchState.captured[1]; -} - - -/*! 
- Replaces every occurrence of this regular expression in - \a str with \a after and returns the result. - - For regular expressions containing \l{capturing parentheses}, - occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced - with \a{rx}.cap(1), cap(2), ... - - \sa indexIn(), lastIndexIn(), QRegExp::cap() -*/ -QString QRegExp::replaceIn(const QString &str, const QString &after) const -{ - struct QStringCapture - { - int pos; - int len; - int no; - }; - - QRegExp rx2(*this); - - if (str.isEmpty() && rx2.indexIn(str) == -1) - return str; - - QString s(str); - - int index = 0; - int numCaptures = rx2.captureCount(); - int al = after.length(); - QRegExp::CaretMode caretMode = QRegExp::CaretAtZero; - - if (numCaptures > 0) { - const QChar *uc = after.unicode(); - int numBackRefs = 0; - - for (int i = 0; i < al - 1; i++) { - if (uc[i] == QLatin1Char('\\')) { - int no = uc[i + 1].digitValue(); - if (no > 0 && no <= numCaptures) - numBackRefs++; - } - } - - /* - This is the harder case where we have back-references. 
- */ - if (numBackRefs > 0) { - QVarLengthArray<QStringCapture, 16> captures(numBackRefs); - int j = 0; - - for (int i = 0; i < al - 1; i++) { - if (uc[i] == QLatin1Char('\\')) { - int no = uc[i + 1].digitValue(); - if (no > 0 && no <= numCaptures) { - QStringCapture capture; - capture.pos = i; - capture.len = 2; - - if (i < al - 2) { - int secondDigit = uc[i + 2].digitValue(); - if (secondDigit != -1 && ((no * 10) + secondDigit) <= numCaptures) { - no = (no * 10) + secondDigit; - ++capture.len; - } - } - - capture.no = no; - captures[j++] = capture; - } - } - } - - while (index <= s.length()) { - index = rx2.indexIn(s, index, caretMode); - if (index == -1) - break; - - QString after2(after); - for (j = numBackRefs - 1; j >= 0; j--) { - const QStringCapture &capture = captures[j]; - after2.replace(capture.pos, capture.len, rx2.cap(capture.no)); - } - - s.replace(index, rx2.matchedLength(), after2); - index += after2.length(); - - // avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]*")) - if (rx2.matchedLength() == 0) - ++index; - - caretMode = QRegExp::CaretWontMatch; - } - return s; - } - } - - /* - This is the simple and optimized case where we don't have - back-references. 
- */ - while (index != -1) { - struct { - int pos; - int length; - } replacements[2048]; - - int pos = 0; - int adjust = 0; - while (pos < 2047) { - index = rx2.indexIn(s, index, caretMode); - if (index == -1) - break; - int ml = rx2.matchedLength(); - replacements[pos].pos = index; - replacements[pos++].length = ml; - index += ml; - adjust += al - ml; - // avoid infinite loop - if (!ml) - index++; - } - if (!pos) - break; - replacements[pos].pos = s.size(); - int newlen = s.size() + adjust; - - // to continue searching at the right position after we did - // the first round of replacements - if (index != -1) - index += adjust; - QString newstring; - newstring.reserve(newlen + 1); - QChar *newuc = newstring.data(); - QChar *uc = newuc; - int copystart = 0; - int i = 0; - while (i < pos) { - int copyend = replacements[i].pos; - int size = copyend - copystart; - memcpy(static_cast<void*>(uc), static_cast<const void *>(s.constData() + copystart), size * sizeof(QChar)); - uc += size; - memcpy(static_cast<void *>(uc), static_cast<const void *>(after.constData()), al * sizeof(QChar)); - uc += al; - copystart = copyend + replacements[i].length; - i++; - } - memcpy(static_cast<void *>(uc), static_cast<const void *>(s.constData() + copystart), (s.size() - copystart) * sizeof(QChar)); - newstring.resize(newlen); - s = newstring; - caretMode = QRegExp::CaretWontMatch; - } - return s; - -} - - -/*! - \fn QString QRegExp::removeIn(const QString &str) - - Removes every occurrence of this regular expression \a str, and - returns the result - - Does the same as replaceIn(str, QString()). - - \sa indexIn(), lastIndexIn(), replaceIn() -*/ - - -/*! - \fn QString QRegExp::countIn(const QString &str) - - Returns the number of times this regular expression matches - in \a str. 
- - \sa indexIn(), lastIndexIn(), replaceIn() -*/ - -int QRegExp::countIn(const QString &str) const -{ - QRegExp rx2(*this); - int count = 0; - int index = -1; - int len = str.length(); - while (index < len - 1) { // count overlapping matches - index = rx2.indexIn(str, index + 1); - if (index == -1) - break; - count++; - } - return count; -} - -/*! - Splits \a str into substrings wherever this regular expression - matches, and returns the list of those strings. If this regular - expression does not match anywhere in the string, split() returns a - single-element list containing \a str. - - \sa QStringList::join(), section(), QString::split() -*/ -QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const -{ - QRegExp rx2(*this); - QStringList list; - int start = 0; - int extra = 0; - int end; - while ((end = rx2.indexIn(str, start + extra)) != -1) { - int matchedLen = rx2.matchedLength(); - if (start != end || behavior == Qt::KeepEmptyParts) - list.append(str.mid(start, end - start)); - start = end + matchedLen; - extra = (matchedLen == 0) ? 1 : 0; - } - if (start != str.size() || behavior == Qt::KeepEmptyParts) - list.append(str.mid(start, -1)); - return list; -} - -/*! - \fn QStringList QStringList::filter(const QRegExp &rx) const - - \overload - - Returns a list of all the strings that match the regular - expression \a rx. -*/ -QStringList QRegExp::filterList(const QStringList &stringList) const -{ - QStringList res; - for (const QString &s : stringList) { - if (containedIn(s)) - res << s; - } - return res; -} - -/*! - Replaces every occurrence of the regexp \a rx, in each of the - string lists's strings, with \a after. Returns a reference to the - string list. -*/ -QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const -{ - QStringList list; - for (const QString &s : stringList) - list << replaceIn(s, after); - return list; -} - -/*! 
- Returns the index position of the first exact match of this regexp in - \a list, searching forward from index position \a from. Returns - -1 if no item matched. - - \sa lastIndexIn(), contains(), exactMatch() -*/ -int QRegExp::indexIn(const QStringList &list, int from) const -{ - QRegExp rx2(*this); - if (from < 0) - from = qMax(from + list.size(), 0); - for (int i = from; i < list.size(); ++i) { - if (rx2.exactMatch(list.at(i))) - return i; - } - return -1; -} - -/*! - Returns the index position of the last exact match of this regexp in - \a list, searching backward from index position \a from. If \a - from is -1 (the default), the search starts at the last item. - Returns -1 if no item matched. - - \sa indexOf(), contains(), QRegExp::exactMatch() -*/ -int QRegExp::lastIndexIn(const QStringList &list, int from) const -{ - QRegExp rx2(*this); - if (from < 0) - from += list.size(); - else if (from >= list.size()) - from = list.size() - 1; - for (int i = from; i >= 0; --i) { - if (rx2.exactMatch(list.at(i))) - return i; - } - return -1; -} - -#ifndef QT_NO_REGEXP_CAPTURE - -/*! - \since 4.6 - Returns the number of captures contained in the regular expression. - */ -int QRegExp::captureCount() const -{ - prepareEngine(priv); - return priv->eng->captureCount(); -} - -/*! - Returns a list of the captured text strings. - - The first string in the list is the entire matched string. Each - subsequent list element contains a string that matched a - (capturing) subexpression of the regexp. - - For example: - \snippet code/src_corelib_text_qregexp.cpp 14 - - The above example also captures elements that may be present but - which we have no interest in. This problem can be solved by using - non-capturing parentheses: - - \snippet code/src_corelib_text_qregexp.cpp 15 - - Note that if you want to iterate over the list, you should iterate - over a copy, e.g. - \snippet code/src_corelib_text_qregexp.cpp 16 - - Some regexps can match an indeterminate number of times. 
For - example if the input string is "Offsets: 12 14 99 231 7" and the - regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of - all the numbers matched. However, after calling - \c{rx.indexIn(str)}, capturedTexts() will return the list ("12", - "12"), i.e. the entire match was "12" and the first subexpression - matched was "12". The correct approach is to use cap() in a - \l{QRegExp#cap_in_a_loop}{loop}. - - The order of elements in the string list is as follows. The first - element is the entire matching string. Each subsequent element - corresponds to the next capturing open left parentheses. Thus - capturedTexts()[1] is the text of the first capturing parentheses, - capturedTexts()[2] is the text of the second and so on - (corresponding to $1, $2, etc., in some other regexp languages). - - \sa cap(), pos() -*/ -QStringList QRegExp::capturedTexts() const -{ - if (priv->capturedCache.isEmpty()) { - prepareEngine(priv); - const int *captured = priv->matchState.captured; - int n = priv->matchState.capturedSize; - - for (int i = 0; i < n; i += 2) { - QString m; - if (captured[i + 1] == 0) - m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty - else if (captured[i] >= 0) - m = priv->t.mid(captured[i], captured[i + 1]); - priv->capturedCache.append(m); - } - priv->t.clear(); - } - return priv->capturedCache; -} - -/*! - \internal -*/ -QStringList QRegExp::capturedTexts() -{ - return const_cast<const QRegExp *>(this)->capturedTexts(); -} - -/*! - Returns the text captured by the \a nth subexpression. The entire - match has index 0 and the parenthesized subexpressions have - indexes starting from 1 (excluding non-capturing parentheses). - - \snippet code/src_corelib_text_qregexp.cpp 17 - - The order of elements matched by cap() is as follows. The first - element, cap(0), is the entire matching string. Each subsequent - element corresponds to the next capturing open left parentheses. 
- Thus cap(1) is the text of the first capturing parentheses, cap(2) - is the text of the second, and so on. - - \sa capturedTexts(), pos() -*/ -QString QRegExp::cap(int nth) const -{ - return capturedTexts().value(nth); -} - -/*! - \internal -*/ -QString QRegExp::cap(int nth) -{ - return const_cast<const QRegExp *>(this)->cap(nth); -} - -/*! - Returns the position of the \a nth captured text in the searched - string. If \a nth is 0 (the default), pos() returns the position - of the whole match. - - Example: - \snippet code/src_corelib_text_qregexp.cpp 18 - - For zero-length matches, pos() always returns -1. (For example, if - cap(4) would return an empty string, pos(4) returns -1.) This is - a feature of the implementation. - - \sa cap(), capturedTexts() -*/ -int QRegExp::pos(int nth) const -{ - if (nth < 0 || nth >= priv->matchState.capturedSize / 2) - return -1; - else - return priv->matchState.captured[2 * nth]; -} - -/*! - \internal -*/ -int QRegExp::pos(int nth) -{ - return const_cast<const QRegExp *>(this)->pos(nth); -} - -/*! - Returns a text string that explains why a regexp pattern is - invalid the case being; otherwise returns "no error occurred". - - \sa isValid() -*/ -QString QRegExp::errorString() const -{ - if (isValid()) { - return QString::fromLatin1(RXERR_OK); - } else { - return priv->eng->errorString(); - } -} - -/*! - \internal -*/ -QString QRegExp::errorString() -{ - return const_cast<const QRegExp *>(this)->errorString(); -} - -#endif - -/*! - Returns the string \a str with every regexp special character - escaped with a backslash. The special characters are $, (,), *, +, - ., ?, [, \,], ^, {, | and }. 
- - Example: - - \snippet code/src_corelib_text_qregexp.cpp 19 - - This function is useful to construct regexp patterns dynamically: - - \snippet code/src_corelib_text_qregexp.cpp 20 - - \sa setPatternSyntax() -*/ -QString QRegExp::escape(const QString &str) -{ - QString quoted; - const int count = str.count(); - quoted.reserve(count * 2); - const QLatin1Char backslash('\\'); - for (int i = 0; i < count; i++) { - switch (str.at(i).toLatin1()) { - case '$': - case '(': - case ')': - case '*': - case '+': - case '.': - case '?': - case '[': - case '\\': - case ']': - case '^': - case '{': - case '|': - case '}': - quoted.append(backslash); - } - quoted.append(str.at(i)); - } - return quoted; -} - - -#ifndef QT_NO_DATASTREAM -/*! - \relates QRegExp - - Writes the regular expression \a regExp to stream \a out. - - \sa {Serializing Qt Data Types} -*/ -QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp) -{ - return out << regExp.pattern() << (quint8)regExp.caseSensitivity() - << (quint8)regExp.patternSyntax() - << (quint8)!!regExp.isMinimal(); -} - -/*! - \relates QRegExp - - Reads a regular expression from stream \a in into \a regExp. 
- - \sa {Serializing Qt Data Types} -*/ -QDataStream &operator>>(QDataStream &in, QRegExp ®Exp) -{ - QString pattern; - quint8 cs; - quint8 patternSyntax; - quint8 isMinimal; - - in >> pattern >> cs >> patternSyntax >> isMinimal; - - QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs), - QRegExp::PatternSyntax(patternSyntax)); - - newRegExp.setMinimal(isMinimal); - regExp = newRegExp; - return in; -} -#endif // QT_NO_DATASTREAM - -#ifndef QT_NO_DEBUG_STREAM -QDebug operator<<(QDebug dbg, const QRegExp &r) -{ - QDebugStateSaver saver(dbg); - dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax() - << ", pattern='"<< r.pattern() << "')"; - return dbg; -} -#endif - -QT_END_NAMESPACE diff --git a/src/corelib/text/qregexp.h b/src/corelib/text/qregexp.h deleted file mode 100644 index 0c117fd17f..0000000000 --- a/src/corelib/text/qregexp.h +++ /dev/null @@ -1,151 +0,0 @@ -/**************************************************************************** -** -** Copyright (C) 2016 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. 
Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ - -#ifndef QREGEXP_H -#define QREGEXP_H - -#include <QtCore/qglobal.h> - -#ifndef QT_NO_REGEXP - -#include <QtCore/qstring.h> -#include <QtCore/qvariant.h> - -QT_BEGIN_NAMESPACE - - -struct QRegExpPrivate; -class QStringList; -class QRegExp; - -Q_CORE_EXPORT size_t qHash(const QRegExp &key, size_t seed = 0) noexcept; - -class Q_CORE_EXPORT QRegExp -{ -public: - enum PatternSyntax { - RegExp, - Wildcard, - FixedString, - RegExp2, - WildcardUnix, - W3CXmlSchema11 }; - enum CaretMode { CaretAtZero, CaretAtOffset, CaretWontMatch }; - - QRegExp(); - explicit QRegExp(const QString &pattern, Qt::CaseSensitivity cs = Qt::CaseSensitive, - PatternSyntax syntax = RegExp); - QRegExp(const QRegExp &rx); - ~QRegExp(); - QRegExp &operator=(const QRegExp &rx); - QRegExp &operator=(QRegExp &&other) noexcept { swap(other); return *this; } - void swap(QRegExp &other) noexcept { qSwap(priv, other.priv); } - - bool operator==(const QRegExp &rx) const; - inline bool operator!=(const QRegExp &rx) const { return !operator==(rx); } - - bool isEmpty() const; - bool 
isValid() const; - QString pattern() const; - void setPattern(const QString &pattern); - Qt::CaseSensitivity caseSensitivity() const; - void setCaseSensitivity(Qt::CaseSensitivity cs); - PatternSyntax patternSyntax() const; - void setPatternSyntax(PatternSyntax syntax); - - bool isMinimal() const; - void setMinimal(bool minimal); - - bool exactMatch(const QString &str) const; - - operator QVariant() const; - - int indexIn(const QString &str, int offset = 0, CaretMode caretMode = CaretAtZero) const; - int lastIndexIn(const QString &str, int offset = -1, CaretMode caretMode = CaretAtZero) const; - int matchedLength() const; -#ifndef QT_NO_REGEXP_CAPTURE - int captureCount() const; - QStringList capturedTexts() const; - QStringList capturedTexts(); - QString cap(int nth = 0) const; - QString cap(int nth = 0); - int pos(int nth = 0) const; - int pos(int nth = 0); - QString errorString() const; - QString errorString(); -#endif - - QString replaceIn(const QString &str, const QString &after) const; - QString removeIn(const QString &str) const - { return replaceIn(str, QString()); } - bool containedIn(const QString &str) const - { return indexIn(str) != -1; } - int countIn(const QString &str) const; - - QStringList splitString(const QString &str, Qt::SplitBehavior behavior = Qt::KeepEmptyParts) const; - - int indexIn(const QStringList &list, int from) const; - int lastIndexIn(const QStringList &list, int from) const; - QStringList replaceIn(const QStringList &stringList, const QString &after) const; - QStringList filterList(const QStringList &stringList) const; - - static QString escape(const QString &str); - - friend Q_CORE_EXPORT size_t qHash(const QRegExp &key, size_t seed) noexcept; - -private: - QRegExpPrivate *priv; -}; - -#ifndef QT_NO_DATASTREAM -Q_CORE_EXPORT QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp); -Q_CORE_EXPORT QDataStream &operator>>(QDataStream &in, QRegExp ®Exp); -#endif - -#ifndef QT_NO_DEBUG_STREAM -Q_CORE_EXPORT QDebug 
operator<<(QDebug, const QRegExp &); -#endif - -QT_END_NAMESPACE - -Q_DECLARE_METATYPE(QRegExp) - -#endif // QT_NO_REGEXP - -#endif // QREGEXP_H diff --git a/src/corelib/text/text.pri b/src/corelib/text/text.pri index 89fbdddd83..0d9a6af454 100644 --- a/src/corelib/text/text.pri +++ b/src/corelib/text/text.pri @@ -16,7 +16,6 @@ HEADERS += \ text/qlocale_p.h \ text/qlocale_tools_p.h \ text/qlocale_data_p.h \ - text/qregexp.h \ text/qstring.h \ text/qstringalgorithms.h \ text/qstringalgorithms_p.h \ @@ -41,7 +40,6 @@ SOURCES += \ text/qcollator.cpp \ text/qlocale.cpp \ text/qlocale_tools.cpp \ - text/qregexp.cpp \ text/qstring.cpp \ text/qstringbuilder.cpp \ text/qstringconverter.cpp \ diff --git a/src/tools/uic/qclass_lib_map.h b/src/tools/uic/qclass_lib_map.h index b63b05107b..c0862a9c02 100644 --- a/src/tools/uic/qclass_lib_map.h +++ b/src/tools/uic/qclass_lib_map.h @@ -214,7 +214,6 @@ QT_CLASS_LIB(QPointF, QtCore, qpoint.h) QT_CLASS_LIB(QQueue, QtCore, qqueue.h) QT_CLASS_LIB(QRect, QtCore, qrect.h) QT_CLASS_LIB(QRectF, QtCore, qrect.h) -QT_CLASS_LIB(QRegExp, QtCore, qregexp.h) QT_CLASS_LIB(QScopedPointerDeleter, QtCore, qscopedpointer.h) QT_CLASS_LIB(QScopedPointerArrayDeleter, QtCore, qscopedpointer.h) QT_CLASS_LIB(QScopedPointerPodDeleter, QtCore, qscopedpointer.h) |