From 9f13a7d020749e936dfe0b4c0a1d46f4dbee810f Mon Sep 17 00:00:00 2001 From: Eskil Abrahamsen Blomfeldt Date: Fri, 2 Mar 2012 16:20:55 +0100 Subject: Make cache of opentype tables in Harfbuzz face lazy The mechanism in fontconfig which determines if a certain character is available (FcCharSetHasChar()) may give false positives, in which case we would load and unload those fonts per every char for which FC gave us a false positive. This was a major performance regression. Specifically the false positives happened when looking at e.g. italic variants of certain multilingual fonts, since we only check the charset of the font family as a whole and not of the specific variant, which may only support a subset of the chars. To optimize this, we remove the deletion of the font engines after loading them, but also wait with loading the opentype tables until they are actually needed. This means that for the false positives, we will load the font, but the cached data for each unused font will be much smaller. 
Change-Id: Idfc794401a2080da5946bf65204eb947aeb635ed Reviewed-by: Lars Knoll --- src/corelib/tools/qharfbuzz.cpp | 7 ++++++- src/corelib/tools/qharfbuzz_p.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qharfbuzz.cpp b/src/corelib/tools/qharfbuzz.cpp index 7d08547ab8..11126b814d 100644 --- a/src/corelib/tools/qharfbuzz.cpp +++ b/src/corelib/tools/qharfbuzz.cpp @@ -122,7 +122,12 @@ HB_Bool qShapeItem(HB_ShaperItem *item) HB_Face qHBNewFace(void *font, HB_GetFontTableFunc tableFunc) { - return HB_NewFace(font, tableFunc); + return HB_AllocFace(font, tableFunc); +} + +HB_Face qHBLoadFace(HB_Face face) +{ + return HB_LoadFace(face); } void qHBFreeFace(HB_Face face) diff --git a/src/corelib/tools/qharfbuzz_p.h b/src/corelib/tools/qharfbuzz_p.h index cc575ddffa..3cef3a55dd 100644 --- a/src/corelib/tools/qharfbuzz_p.h +++ b/src/corelib/tools/qharfbuzz_p.h @@ -68,6 +68,7 @@ Q_CORE_EXPORT HB_Bool qShapeItem(HB_ShaperItem *item); // ### temporary Q_CORE_EXPORT HB_Face qHBNewFace(void *font, HB_GetFontTableFunc tableFunc); Q_CORE_EXPORT void qHBFreeFace(HB_Face); +Q_CORE_EXPORT HB_Face qHBLoadFace(HB_Face face); Q_DECLARE_TYPEINFO(HB_GlyphAttributes, Q_PRIMITIVE_TYPE); Q_DECLARE_TYPEINFO(HB_FixedPoint, Q_PRIMITIVE_TYPE); -- cgit v1.2.3 From c7cb455a47c42e8e658e3433defee613f8643cd2 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Mon, 23 Jan 2012 22:47:59 +0000 Subject: QRegularExpression: add QRegularExpression* set of classes Added QRegularExpression, QRegularExpressionMatch and QRegularExpressionMatchIterator as PCRE-enabled, regexp classes. Documentation is included, as well as a first round of autotests. 
Task-number: QTBUG-23489 Change-Id: Id47031b80602c913ccd2fd740070e3024ea06abc Reviewed-by: Thiago Macieira Reviewed-by: Lars Knoll --- src/corelib/tools/qregularexpression.cpp | 2022 ++++++++++++++++++++++++++++++ src/corelib/tools/qregularexpression.h | 245 ++++ src/corelib/tools/tools.pri | 8 + 3 files changed, 2275 insertions(+) create mode 100644 src/corelib/tools/qregularexpression.cpp create mode 100644 src/corelib/tools/qregularexpression.h (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp new file mode 100644 index 0000000000..488a454aaa --- /dev/null +++ b/src/corelib/tools/qregularexpression.cpp @@ -0,0 +1,2022 @@ +/**************************************************************************** +** +** Copyright (C) 2012 Giuseppe D'Angelo . +** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies). +** Contact: http://www.qt-project.org/ +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** GNU Lesser General Public License Usage +** This file may be used under the terms of the GNU Lesser General Public +** License version 2.1 as published by the Free Software Foundation and +** appearing in the file LICENSE.LGPL included in the packaging of this +** file. Please review the following information to ensure the GNU Lesser +** General Public License version 2.1 requirements will be met: +** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. 
+** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU General +** Public License version 3.0 as published by the Free Software Foundation +** and appearing in the file LICENSE.GPL included in the packaging of this +** file. Please review the following information to ensure the GNU General +** Public License version 3.0 requirements will be met: +** http://www.gnu.org/copyleft/gpl.html. +** +** Other Usage +** Alternatively, this file may be used in accordance with the terms and +** conditions contained in a signed written agreement between you and Nokia. +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qregularexpression.h" + +#include +#include +#include +#include +#include + +#include + +// after how many usages we optimize the regexp +static const unsigned int OPTIMIZE_AFTER_USE_COUNT = 10; + +QT_BEGIN_NAMESPACE + +/*! + \class QRegularExpression + \reentrant + + \brief The QRegularExpression class provides pattern matching using regular + expressions. + + \since 5.0 + + \ingroup tools + \ingroup shared + + \keyword regular expression + + Regular expressions, or \e{regexps}, are a very powerful tool to handle + strings and texts. This is useful in many contexts, e.g., + + \table + \row \i Validation + \i A regexp can test whether a substring meets some criteria, + e.g. is an integer or contains no whitespace. + \row \i Searching + \i A regexp provides more powerful pattern matching than + simple substring matching, e.g., match one of the words + \e{mail}, \e{letter} or \e{correspondence}, but none of the + words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. + \row \i Search and Replace + \i A regexp can replace all occurrences of a substring with a + different substring, e.g., replace all occurrences of \e{&} + with \e{\&} except where the \e{&} is already followed by + an \e{amp;}. 
+ \row \i String Splitting + \i A regexp can be used to identify where a string should be + split apart, e.g. splitting tab-delimited strings. + \endtable + + This document is by no means a complete reference to pattern matching using + regular expressions, and the following parts will require the reader to + have some basic knowledge about Perl-like regular expressions and their + pattern syntax. + + Good references about regular expressions include: + + \list + \o \e {Mastering Regular Expressions} (Third Edition) by Jeffrey E. F. + Friedl, ISBN 0-596-52812-4; + \o the \l{http://pcre.org/pcre.txt} {pcrepattern(3)} man page, describing + the pattern syntax supported by PCRE (the reference implementation of + Perl-compatible regular expressions); + \o the \l{http://perldoc.perl.org/perlre.html} {Perl's regular expression + documentation} and the \l{http://perldoc.perl.org/perlretut.html} {Perl's + regular expression tutorial}. + \endlist + + \tableofcontents + + \section1 Introduction + + QRegularExpression implements Perl-compatible regular expressions. It fully + supports Unicode. For an overview of the regular expression syntax + supported by QRegularExpression, please refer to the aforementioned + pcrepattern(3) man page. A regular expression is made up of two things: a + \bold{pattern string} and a set of \bold{pattern options} that change the + meaning of the pattern string. + + You can set the pattern string by passing a string to the QRegularExpression + constructor: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 0 + + This sets the pattern string to \c{a pattern}. 
You can also use the + setPattern() function to set a pattern on an existing QRegularExpression + object: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 1 + + Note that due to C++ literal strings rules, you must escape all backslashes + inside the pattern string with another backslash: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 2 + + The pattern() function returns the pattern that is currently set for a + QRegularExpression object: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 3 + + \section1 Pattern options + + The meaning of the pattern string can be modified by setting one or more + \e{pattern options}. For instance, it is possible to set a pattern to match + case insensitively by setting the QRegularExpression::CaseInsensitiveOption. + + You can set the options by passing them to the QRegularExpression + constructor, as in: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 4 + + Alternatively, you can use the setPatternOptions() function on an existing + QRegularExpression object: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 5 + + It is possible to get the pattern options currently set on a + QRegularExpression object by using the patternOptions() function: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 6 + + Please refer to the QRegularExpression::PatternOption enum documentation for + more information about each pattern option. + + \section1 Match type and match options + + The last two arguments of the match() and the globalMatch() functions set + the match type and the match options. The match type is a value of the + QRegularExpression::MatchType enum; the "traditional" matching algorithm is + chosen by using the NormalMatch match type (the default). 
It is also + possible to enable partial matching of the regular expression against a + subject string: see the \l{partial matching} section for more details. + + The match options are a set of one or more QRegularExpression::MatchOption + values. They change the way a specific match of a regular expression + against a subject string is done. Please refer to the + QRegularExpression::MatchOption enum documentation for more details. + + \target normal matching + \section1 Normal matching + + In order to perform a match you can simply invoke the match() function + passing a string to match against. We refer to this string as the + \e{subject string}. The result of the match() function is a + QRegularExpressionMatch object that can be used to inspect the results of + the match. For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 7 + + If a match is successful, the (implicit) capturing group number 0 can be + used to retrieve the substring matched by the entire pattern (see also the + section about \l{extracting captured substrings}): + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 8 + + It's also possible to start a match at an arbitrary offset inside the + subject string by passing the offset as an argument of the + match() function. In the following example \c{"12 abc"} + is not matched because the match is started at offset 1: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 9 + + \target extracting captured substrings + \section2 Extracting captured substrings + + The QRegularExpressionMatch object contains also information about the + substrings captured by the capturing groups in the pattern string. 
The + \l{QRegularExpressionMatch::}{captured()} function will return the string + captured by the n-th capturing group: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 10 + + Capturing groups in the pattern are numbered starting from 1, and the + implicit capturing group 0 is used to capture the substring that matched + the entire pattern. + + It's also possible to retrieve the starting and the ending offsets (inside + the subject string) of each captured substring, by using the + \l{QRegularExpressionMatch::}{capturedStart()} and the + \l{QRegularExpressionMatch::}{capturedEnd()} functions: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 11 + + All of these functions have an overload taking a QString as a parameter + in order to extract \e{named} captured substrings. For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 12 + + \target global matching + \section1 Global matching + + \e{Global matching} is useful to find all the occurrences of a given + regular expression inside a subject string. Suppose that we want to extract + all the words from a given string, where a word is a substring matching + the pattern \c{\w+}. + + QRegularExpression::globalMatch returns a QRegularExpressionMatchIterator, + which is a Java-like forward iterator that can be used to iterate over the + results. For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 13 + + Since it's a Java-like iterator, the QRegularExpressionMatchIterator will + point immediately before the first result. Every result is returned as a + QRegularExpressionMatch object. The + \l{QRegularExpressionMatchIterator::}{hasNext()} function will return true + if there's at least one more result, and + \l{QRegularExpressionMatchIterator::}{next()} will return the next result + and advance the iterator. 
Continuing from the previous example: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 14 + + You can also use \l{QRegularExpressionMatchIterator::}{peekNext()} to get + the next result without advancing the iterator. + + It is possible to pass a starting offset and one or more match options to + the globalMatch() function, exactly like normal matching with match(). + + \target partial matching + \section1 Partial matching + + A \e{partial match} is obtained when the end of the subject string is + reached, but more characters are needed to successfully complete the match. + Note that a partial match is usually much more inefficient than a normal + match because many optimizations of the matching algorithm cannot be + employed. + + A partial match must be explicitly requested by specifying a match type of + PartialPreferCompleteMatch or PartialPreferFirstMatch when calling + QRegularExpression::match or QRegularExpression::globalMatch. If a partial + match is found, then calling the \l{QRegularExpressionMatch::}{hasMatch()} + function on the QRegularExpressionMatch object returned by match() will + return \c{false}, but \l{QRegularExpressionMatch::}{hasPartialMatch()} will return + \c{true}. + + When a partial match is found, no captured substrings are returned, and the + (implicit) capturing group 0 corresponding to the whole match captures the + partially matched substring of the subject string. + + Note that asking for a partial match can still lead to a complete match, if + one is found; in this case, \l{QRegularExpressionMatch::}{hasMatch()} will + return \c{true} and \l{QRegularExpressionMatch::}{hasPartialMatch()} + \c{false}. It never happens that a QRegularExpressionMatch reports both a + partial and a complete match. + + Partial matching is mainly useful in two scenarios: validating user input + in real time and incremental/multi-segment matching. 
+ + \target validating user input + \section2 Validating user input + + Suppose that we would like the user to input a date in a specific + format, for instance "MMM dd, yyyy". We can check the input validity with + a pattern like: + + \c{^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d\d?, \d\d\d\d$} + + (This pattern doesn't catch invalid days, but let's keep it for the + example's purposes). + + We would like to validate the input with this regular expression \e{while} + the user is typing it, so that we can report an error in the input as soon + as it is committed (for instance, the user typed the wrong key). In order + to do so we must distinguish three cases: + + \list + \o the input cannot possibly match the regular expression; + \o the input does match the regular expression; + \o the input does not match the regular expression right now, + but it will if more characters are added to it. + \endlist + + Note that these three cases represent exactly the possible states of a + QValidator (see the QValidator::State enum). + + In particular, in the last case we want the regular expression engine to + report a partial match: we are successfully matching the pattern against + the subject string but the matching cannot continue because the end of the + subject is encountered. Notice, however, that the matching algorithm should + continue and try all possibilities, and in case a complete (non-partial) + match is found, then this one should be reported, and the input string + accepted as fully valid. + + This behaviour is implemented by the PartialPreferCompleteMatch match type. 
+ For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 15 + + If matching the same regular expression against the subject string leads to + a complete match, it is reported as usual: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 16 + + Another example with a different pattern, showing the behaviour of + preferring a complete match over a partial one: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 17 + + In this case, the subpattern \c{abc\\w+X} partially matches the subject + string; however, the subpattern \c{def} matches the subject string + completely, and therefore a complete match is reported. + + In case multiple partial matches are found when matching (but no complete + match), then the QRegularExpressionMatch will report the first one that + is found. For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 18 + + \section2 Incremental/multi-segment matching + + Incremental matching is another use case of partial matching. Suppose that + we want to find the occurrences of a regular expression inside a large text + (that is, substrings matching the regular expression). In order to do so we + would like to "feed" the large text to the regular expression engine in + smaller chunks. The obvious problem is what happens if the substring that + matches the regular expression spans across two or more chunks. + + In this case, the regular expression engine should report a partial match, + so that we can match again adding new data and (eventually) get a complete + match. This implies that the regular expression engine may assume that + there are other characters \e{beyond the end} of the subject string. This + is not to be taken literally -- the engine will never try to access + any character after the last one in the subject. + + QRegularExpression implements this behaviour when using the + PartialPreferFirstMatch match type. 
This match type reports a partial match + as soon as it is found, and other match alternatives are not tried + (even if they could lead to a complete match). For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 19 + + This happens because when matching the first branch of the alternation + operator a partial match is found, and therefore matching stops, without + trying the second branch. Another example: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 20 + + This shows what could seem a counterintuitive behaviour of quantifiers: + since \c{?} is greedy, then the engine tries first to continue the match + after having matched \c{"abc"}; but then the matching reaches the end of the + subject string, and therefore a partial match is reported. This is + even more surprising in the following example: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 21 + + It's easy to understand this behaviour if we remember that the engine + expects the subject string to be only a substring of the whole text we're + looking for a match into (that is, as we said before, the engine + assumes that there are other characters beyond the end of the subject + string). + + Since the \c{*} quantifier is greedy, then reporting a complete match could + be an error, because after the current subject \c{"abc"} there may be other + occurrences of \c{"abc"}. For instance, the complete text could have been + "abcabcX", and therefore the \e{right} match to report (in the complete + text) would have been \c{"abcabc"}; by matching only against the leading + \c{"abc"} we instead get a partial match. + + \section1 Error handling + + It is possible for a QRegularExpression object to be invalid because of + syntax errors in the pattern string. 
The isValid() function will return + true if the regular expression is valid, or false otherwise: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 22 + + You can get more information about the specific error by calling the + errorString() function; moreover, the patternErrorOffset() function + will return the offset inside the pattern string + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 23 + + If a match is attempted with an invalid QRegularExpression, then the + returned QRegularExpressionMatch object will be invalid as well (that is, + its \l{QRegularExpressionMatch::}{isValid()} function will return false). + The same applies for attempting a global match. + + \section1 Unsupported Perl-compatible regular expressions features + + QRegularExpression does not support all the features available in + Perl-compatible regular expressions. The most notable one is the fact that + duplicated names for capturing groups are not supported, and using them can + lead to undefined behaviour. + + This may change in a future version of Qt. + + \section1 Notes for QRegExp users + + The QRegularExpression class introduced in Qt 5 is a big improvement upon + QRegExp, in terms of APIs offered, supported pattern syntax and speed of + execution. The biggest difference is that QRegularExpression simply holds a + regular expression, and it's \e{not} modified when a match is requested. + Instead, a QRegularExpressionMatch object is returned, in order to check + the result of a match and extract the captured substring. The same applies + with global matching and QRegularExpressionMatchIterator. + + Other differences are outlined below. + + \section2 Exact matching + + QRegExp::exactMatch in Qt 4 served for two purposes: it exactly matched + a regular expression against a subject string, and it implemented partial + matching. 
In fact, if an exact match was not found, one could still find + out how much of the subject string was matched by the regular expression + by calling QRegExp::matchedLength(). If the returned length was equal + to the subject string's length, then one could deduce that a partial match + was found. + + QRegularExpression supports partial matching explicitly by means of the + appropriate MatchType. If instead you simply want to be sure that the + subject string matches the regular expression exactly, you can wrap the + pattern between a couple of anchoring expressions. Simply + putting the pattern between the \c{^} and the \c{$} anchors is enough + in most cases: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 24 + + However, remember that the \c{$} anchor not only matches at the end of the + string, but also at a newline character right before the end of the string; + that is, the previous pattern matches against the string "this pattern must + match exactly\n". Also, the behaviour of both the \c{^} and the \c{$} + anchors changes if the MultilineOption is set either explicitly (as a + pattern option) or implicitly (as a directive inside the pattern string). + + Therefore, in the most general case, you should wrap the pattern between + the \c{\A} and the \c{\z} anchors: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 25 + + Note the usage of the non-capturing group in order to preserve the meaning + of the branch operator inside the pattern. + + \section2 Global matching + + Due to limitations of the QRegExp API it was impossible to implement global + matching correctly (that is, like Perl does). In particular, patterns that + can match 0 characters (like \c{"a*"}) are problematic. + + QRegularExpression::globalMatch implements Perl global match correctly, and + the returned iterator can be used to examine each result. 
+ + \section2 Wildcard matching + + There is no equivalent of wildcard matching in QRegularExpression. + Nevertheless, rewriting a regular expression in wildcard syntax to a + Perl-compatible regular expression is a very easy task, given the fact + that wildcard syntax supported by QRegExp is very simple. + + \section2 Other pattern syntaxes + + QRegularExpression supports only Perl-compatible regular expressions. + + \section2 Minimal matching + + QRegExp::setMinimal implemented minimal matching by simply reversing the + greediness of the quantifiers (QRegExp did not support lazy quantifiers, + like \c{*?}, \c{+?}, etc.). QRegularExpression instead does support greedy, + lazy and possessive quantifiers. The InvertedGreedinessOption + pattern option can be useful to emulate the effects of QRegExp::setMinimal: + if enabled, it inverts the greediness of quantifiers (greedy ones become + lazy and vice versa). + + \section2 Caret modes + + The AnchoredMatchOption match option can be used to emulate the + QRegExp::CaretAtOffset behaviour. There is no equivalent for the other + QRegExp::CaretMode modes. + + \sa QRegularExpressionMatch, QRegularExpressionMatchIterator +*/ + +/*! + \class QRegularExpressionMatch + \reentrant + + \brief The QRegularExpressionMatch class provides the results of matching + a QRegularExpression against a string. + + \since 5.0 + + \ingroup tools + \ingroup shared + + \keyword regular expression match + + A QRegularExpressionMatch object can be obtained by calling the + QRegularExpression::match() function, or as a single result of a global + match from a QRegularExpressionMatchIterator. + + The success or the failure of a match attempt can be inspected by calling + the hasMatch() function. QRegularExpressionMatch also reports a successful + partial match through the hasPartialMatch() function. + + In addition, QRegularExpressionMatch returns the substrings captured by the + capturing groups in the pattern string. 
The implicit capturing group with + index 0 captures the result of the whole match. The captured() function + returns each substring captured, either by the capturing group's index or + by its name: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 29 + + For each captured substring it is possible to query its starting and ending + offsets in the subject string by calling the capturedStart() and the + capturedEnd() function, respectively. The length of each captured + substring is available using the capturedLength() function. + + The convenience function capturedTexts() will return \e{all} the captured + substrings at once (including the substring matched by the entire pattern) + in the order they have been captured by capturing groups; that is, + \c{captured(i) == capturedTexts().at(i)}. + + You can retrieve the QRegularExpression object the subject string was + matched against by calling the regularExpression() function; the + match type and the match options are available as well by calling + the matchType() and the matchOptions() respectively. + + Please refer to the QRegularExpression documentation for more information + about the Qt regular expression classes. + + \sa QRegularExpression +*/ + +/*! + \class QRegularExpressionMatchIterator + \reentrant + + \brief The QRegularExpressionMatchIterator class provides an iterator on + the results of a global match of a QRegularExpression object against a string. + + \since 5.0 + + \ingroup tools + \ingroup shared + + \keyword regular expression iterator + + A QRegularExpressionMatchIterator object is a forward only Java-like + iterator; it can be obtained by calling the + QRegularExpression::globalMatch() function. A new + QRegularExpressionMatchIterator will be positioned before the first result. + You can then call the hasNext() function to check if there are more + results available; if so, the next() function will return the next + result and advance the iterator. 
+ + Each result is a QRegularExpressionMatch object holding all the information + for that result (including captured substrings). + + For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 30 + + Moreover, QRegularExpressionMatchIterator offers a peekNext() function + to get the next result \e{without} advancing the iterator. + + You can retrieve the QRegularExpression object the subject string was + matched against by calling the regularExpression() function; the + match type and the match options are available as well by calling + the matchType() and the matchOptions() respectively. + + Please refer to the QRegularExpression documentation for more information + about the Qt regular expression classes. + + \sa QRegularExpression, QRegularExpressionMatch +*/ + + +/*! + \enum QRegularExpression::PatternOption + + The PatternOption enum defines modifiers to the way the pattern string + should be interpreted, and therefore the way the pattern matches against a + subject string. + + \value NoPatternOption + No pattern options are set. + + \value CaseInsensitiveOption + The pattern should match against the subject string in a case + insensitive way. This option corresponds to the /i modifier in Perl + regular expressions. + + \value DotMatchesEverythingOption + The dot metacharacter (\c{.}) in the pattern string is allowed to match + any character in the subject string, including newlines (normally, the + dot does not match newlines). This option corresponds to the \c{/s} + modifier in Perl regular expressions. + + \value MultilineOption + The caret (\c{^}) and the dollar (\c{$}) metacharacters in the pattern + string are allowed to match, respectively, immediately after and + immediately before any newline in the subject string, as well as at the + very beginning and at the very end of the subject string. This option + corresponds to the \c{/m} modifier in Perl regular expressions. 
+ + \value ExtendedPatternSyntaxOption + Any whitespace in the pattern string which is not escaped and outside a + character class is ignored. Moreover, an unescaped sharp (\bold{#}) + outside a character class causes all the following characters, until + the first newline (included), to be ignored. This can be used to + increase the readability of a pattern string as well as put comments + inside regular expressions; this is particularly useful if the pattern + string is loaded from a file or written by the user, because in C++ + code it is always possible to use the rules for string literals to put + comments outside the pattern string. This option corresponds to the \c{/x} + modifier in Perl regular expressions. + + \value InvertedGreedinessOption + The greediness of the quantifiers is inverted: \c{*}, \c{+}, \c{?}, + \c{{m,n}}, etc. become lazy, while their lazy versions (\c{*?}, + \c{+?}, \c{??}, \c{{m,n}?}, etc.) become greedy. There is no equivalent + for this option in Perl regular expressions. + + \value DontCaptureOption + The non-named capturing groups do not capture substrings; named + capturing groups still work as intended, as well as the implicit + capturing group number 0 corresponding to the entire match. There is no + equivalent for this option in Perl regular expressions. + + \value UseUnicodePropertiesOption + The meaning of the \c{\w}, \c{\d}, etc., character types, as well as + the meaning of their counterparts (\c{\W}, \c{\D}, etc.), is changed + from matching ASCII characters only to matching any character with the + corresponding Unicode property. For instance, \c{\d} is changed to + match any character with the Unicode Nd (decimal digit) property; + \c{\w} to match any character with either the Unicode L (letter) or N + (digit) property, plus underscore, and so on. This option corresponds + to the \c{/u} modifier in Perl regular expressions. +*/ + +/*! 
+ \enum QRegularExpression::MatchType + + The MatchType enum defines the type of the match that should be attempted + against the subject string. + + \value NormalMatch + A normal match is done. + + \value PartialPreferCompleteMatch + The pattern string is matched partially against the subject string. If + a partial match is found, then it is recorded, and other matching + alternatives are tried as usual. If a complete match is then found, + then it's preferred to the partial match; in this case only the + complete match is reported. If instead no complete match is found (but + only the partial one), then the partial one is reported. + + \value PartialPreferFirstMatch + The pattern string is matched partially against the subject string. If + a partial match is found, then matching stops and the partial match is + reported. In this case, other matching alternatives (potentially + leading to a complete match) are not tried. Moreover, this match type + assumes that the subject string is only a substring of a larger text, and + that (in this text) there are other characters beyond the end of the + subject string. This can lead to surprising results; see the discussion + in the \l{partial matching} section for more details. +*/ + +/*! + \enum QRegularExpression::MatchOption + + \value NoMatchOption + No match options are set. + + \value AnchoredMatchOption + The match is constrained to start exactly at the offset passed to + match() in order to be successful, even if the pattern string does not + contain any metacharacter that anchors the match at that point. +*/ + +/*! 
+ \internal +*/ +static int convertToPcreOptions(QRegularExpression::PatternOptions patternOptions) +{ + int options = 0; + + if (patternOptions & QRegularExpression::CaseInsensitiveOption) + options |= PCRE_CASELESS; + if (patternOptions & QRegularExpression::DotMatchesEverythingOption) + options |= PCRE_DOTALL; + if (patternOptions & QRegularExpression::MultilineOption) + options |= PCRE_MULTILINE; + if (patternOptions & QRegularExpression::ExtendedPatternSyntaxOption) + options |= PCRE_EXTENDED; + if (patternOptions & QRegularExpression::InvertedGreedinessOption) + options |= PCRE_UNGREEDY; + if (patternOptions & QRegularExpression::DontCaptureOption) + options |= PCRE_NO_AUTO_CAPTURE; + if (patternOptions & QRegularExpression::UseUnicodePropertiesOption) + options |= PCRE_UCP; + + return options; +} + +/*! + \internal +*/ +static int convertToPcreOptions(QRegularExpression::MatchOptions matchOptions) +{ + int options = 0; + + if (matchOptions & QRegularExpression::AnchoredMatchOption) + options |= PCRE_ANCHORED; + + return options; +} + +struct QRegularExpressionPrivate : QSharedData +{ + QRegularExpressionPrivate(); + ~QRegularExpressionPrivate(); + QRegularExpressionPrivate(const QRegularExpressionPrivate &other); + + void cleanCompiledPattern(); + void compilePattern(); + void getPatternInfo(); + void optimizePattern(); + + QRegularExpressionMatchPrivate *doMatch(const QString &subject, + int offset, + QRegularExpression::MatchType matchType, + QRegularExpression::MatchOptions matchOptions, + const QRegularExpressionMatchPrivate *previous = 0) const; + + int captureIndexForName(const QString &name) const; + + QString pattern; + QRegularExpression::PatternOptions patternOptions; + + // *All* of the following members are set managed while holding this mutex, + // except for isDirty which is set to true by QRegularExpression setters + // (right after a detach happened). + // On the other hand, after the compilation and studying, + // it's safe to *use* (i.e. 
read) them from multiple threads at the same time. + // Therefore, doMatch doesn't need to lock this mutex. + QMutex mutex; + + // The PCRE pointers are reference-counted by the QRegularExpressionPrivate + // objects themselves; when the private is copied (i.e. a detach happened) + // they are set to 0 + pcre16 *compiledPattern; + pcre16_extra *studyData; + const char *errorString; + int errorOffset; + int capturingCount; + unsigned int usedCount; + bool usingCrLfNewlines; + bool isDirty; +}; + +struct QRegularExpressionMatchPrivate : QSharedData +{ + QRegularExpressionMatchPrivate(const QRegularExpression &re, + const QString &subject, + QRegularExpression::MatchType matchType, + QRegularExpression::MatchOptions matchOptions, + int capturingCount); + + QRegularExpressionMatch nextMatch() const; + + QRegularExpression regularExpression; + QString subject; + // the capturedOffsets vector contains pairs of (start, end) positions + // for each captured substring + QVector capturedOffsets; + + QRegularExpression::MatchType matchType; + QRegularExpression::MatchOptions matchOptions; + + int capturedCount; + + bool hasMatch; + bool hasPartialMatch; + bool isValid; +}; + +struct QRegularExpressionMatchIteratorPrivate : QSharedData +{ + QRegularExpressionMatchIteratorPrivate(const QRegularExpression re, + QRegularExpression::MatchType matchType, + QRegularExpression::MatchOptions matchOptions, + const QRegularExpressionMatch &next); + + bool hasNext() const; + QRegularExpressionMatch next; + QRegularExpression regularExpression; + QRegularExpression::MatchType matchType; + QRegularExpression::MatchOptions matchOptions; +}; + +/*! + \internal +*/ +QRegularExpression::QRegularExpression(QRegularExpressionPrivate &dd) + : d(&dd) +{ +} + +/*! 
+ \internal +*/ +QRegularExpressionPrivate::QRegularExpressionPrivate() + : pattern(), patternOptions(0), + mutex(), + compiledPattern(0), studyData(0), + errorString(0), errorOffset(-1), + capturingCount(0), + usedCount(0), + usingCrLfNewlines(false), + isDirty(true) +{ +} + +/*! + \internal +*/ +QRegularExpressionPrivate::~QRegularExpressionPrivate() +{ + cleanCompiledPattern(); +} + +/*! + \internal + + Copies the private, which means copying only the pattern and the pattern + options. The compiledPattern and the studyData pointers are NOT copied (we + do not own them any more), and in general all the members set when + compiling a pattern are set to default values. isDirty is set back to true + so that the pattern has to be recompiled again. +*/ +QRegularExpressionPrivate::QRegularExpressionPrivate(const QRegularExpressionPrivate &other) + : QSharedData(other), + pattern(other.pattern), patternOptions(other.patternOptions), + mutex(), + compiledPattern(0), studyData(0), + errorString(0), + errorOffset(-1), capturingCount(0), + usedCount(0), + usingCrLfNewlines(false), isDirty(true) +{ +} + +/*! + \internal +*/ +void QRegularExpressionPrivate::cleanCompiledPattern() +{ + pcre16_free(compiledPattern); + pcre16_free_study(studyData); + usedCount = 0; + compiledPattern = 0; + studyData = 0; + usingCrLfNewlines = false; + errorOffset = -1; + capturingCount = 0; +} + +/*! + \internal +*/ +void QRegularExpressionPrivate::compilePattern() +{ + QMutexLocker lock(&mutex); + + if (!isDirty) + return; + + isDirty = false; + cleanCompiledPattern(); + + int options = convertToPcreOptions(patternOptions); + options |= PCRE_UTF16; + + int errorCode; + compiledPattern = pcre16_compile2(pattern.utf16(), options, + &errorCode, &errorString, &errorOffset, 0); + + if (!compiledPattern) + return; + + Q_ASSERT(errorCode == 0); + Q_ASSERT(studyData == 0); // studying (=>optimizing) is always done later + errorOffset = -1; + + getPatternInfo(); +} + +/*! 
+ \internal +*/ +void QRegularExpressionPrivate::getPatternInfo() +{ + Q_ASSERT(compiledPattern); + + pcre16_fullinfo(compiledPattern, 0, PCRE_INFO_CAPTURECOUNT, &capturingCount); + + // detect the settings for the newline + int patternNewlineSetting; + pcre16_fullinfo(compiledPattern, studyData, PCRE_INFO_OPTIONS, &patternNewlineSetting); + patternNewlineSetting &= PCRE_NEWLINE_CR | PCRE_NEWLINE_LF | PCRE_NEWLINE_CRLF + | PCRE_NEWLINE_ANY | PCRE_NEWLINE_ANYCRLF; + if (patternNewlineSetting == 0) { + // no option was specified in the regexp, grab PCRE build defaults + int pcreNewlineSetting; + pcre16_config(PCRE_CONFIG_NEWLINE, &pcreNewlineSetting); + switch (pcreNewlineSetting) { + case 13: + patternNewlineSetting = PCRE_NEWLINE_CR; break; + case 10: + patternNewlineSetting = PCRE_NEWLINE_LF; break; + case 3338: // (13<<8 | 10) + patternNewlineSetting = PCRE_NEWLINE_CRLF; break; + case -2: + patternNewlineSetting = PCRE_NEWLINE_ANYCRLF; break; + case -1: + patternNewlineSetting = PCRE_NEWLINE_ANY; break; + default: + qWarning("QRegularExpressionPrivate::compilePattern(): " + "PCRE_CONFIG_NEWLINE returned an unknown newline"); + break; + } + } + + usingCrLfNewlines = (patternNewlineSetting == PCRE_NEWLINE_CRLF) || + (patternNewlineSetting == PCRE_NEWLINE_ANY) || + (patternNewlineSetting == PCRE_NEWLINE_ANYCRLF); +} + +/*! + \internal +*/ +void QRegularExpressionPrivate::optimizePattern() +{ + Q_ASSERT(compiledPattern); + + QMutexLocker lock(&mutex); + + if (studyData || (++usedCount != OPTIMIZE_AFTER_USE_COUNT)) + return; + + int studyOptions = PCRE_STUDY_JIT_COMPILE; + const char *err; + studyData = pcre16_study(compiledPattern, studyOptions, &err); + + if (!studyData && err) + qWarning("QRegularExpressionPrivate::optimizePattern(): pcre_study failed: %s", err); +} + +/*! + \internal + + Returns the capturing group number for the given name. Duplicated names for + capturing groups are not supported. 
+*/ +int QRegularExpressionPrivate::captureIndexForName(const QString &name) const +{ + Q_ASSERT(!name.isEmpty()); + + int index = pcre16_get_stringnumber(compiledPattern, name.utf16()); + if (index >= 0) + return index; + + return -1; +} + +/*! + \internal + + Performs a match of type \a matchType on the given \a subject string with + options \a matchOptions and returns the QRegularExpressionMatchPrivate of + the result. It also advances a match if a previous result is given as \a + previous. + + Advancing a match is a tricky algorithm. If the previous match matched a + non-empty string, we just do an ordinary match at the offset position. + + If the previous match matched an empty string, then an anchored, non-empty + match is attempted at the offset position. If that succeeds, then we got + the next match and we can return it. Otherwise, we advance by 1 position + (which can be one or two code units in UTF-16!) and reattempt a "normal" + match. We also have the problem of detecting the current newline format: if + the new advanced offset is pointing to the beginning of a CRLF sequence, we + must advance over it. 
+*/ +QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString &subject, + int offset, + QRegularExpression::MatchType matchType, + QRegularExpression::MatchOptions matchOptions, + const QRegularExpressionMatchPrivate *previous) const +{ + if (offset < 0) + offset += subject.length(); + + QRegularExpression re(*const_cast(this)); + + if (offset < 0 || offset > subject.length()) + return new QRegularExpressionMatchPrivate(re, subject, matchType, matchOptions, 0); + + if (!compiledPattern) { + qWarning("QRegularExpressionPrivate::doMatch(): called on an invalid QRegularExpression object"); + return new QRegularExpressionMatchPrivate(re, subject, matchType, matchOptions, 0); + } + + QRegularExpressionMatchPrivate *priv = new QRegularExpressionMatchPrivate(re, subject, + matchType, matchOptions, + capturingCount); + + // this is mutex protected + const_cast(this)->optimizePattern(); + + int pcreOptions = convertToPcreOptions(matchOptions); + + if (matchType == QRegularExpression::PartialPreferCompleteMatch) + pcreOptions |= PCRE_PARTIAL_SOFT; + else if (matchType == QRegularExpression::PartialPreferFirstMatch) + pcreOptions |= PCRE_PARTIAL_HARD; + + bool previousMatchWasEmpty = false; + if (previous && previous->hasMatch && + (previous->capturedOffsets.at(0) == previous->capturedOffsets.at(1))) { + previousMatchWasEmpty = true; + } + + int * const captureOffsets = priv->capturedOffsets.data(); + const int captureOffsetsCount = priv->capturedOffsets.size(); + + const unsigned short * const subjectUtf16 = subject.utf16(); + const int subjectLength = subject.length(); + + int result; + + if (!previousMatchWasEmpty) { + result = pcre16_exec(compiledPattern, studyData, + subjectUtf16, subjectLength, + offset, pcreOptions, + captureOffsets, captureOffsetsCount); + } else { + result = pcre16_exec(compiledPattern, studyData, + subjectUtf16, subjectLength, + offset, pcreOptions | PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED, + captureOffsets, 
captureOffsetsCount); + + if (result == PCRE_ERROR_NOMATCH) { + ++offset; + + if (usingCrLfNewlines + && offset < subjectLength + && subjectUtf16[offset - 1] == QLatin1Char('\r') + && subjectUtf16[offset] == QLatin1Char('\n')) { + ++offset; + } else if (offset < subjectLength + && QChar::isLowSurrogate(subjectUtf16[offset])) { + ++offset; + } + + result = pcre16_exec(compiledPattern, studyData, + subjectUtf16, subjectLength, + offset, pcreOptions, + captureOffsets, captureOffsetsCount); + } + } + +#ifdef QREGULAREXPRESSION_DEBUG + qDebug() << "Matching" << pattern << "against" << subject + << offset << matchType << matchOptions << previousMatchWasEmpty + << "result" << result; +#endif + + // result == 0 means not enough space in captureOffsets; should never happen + Q_ASSERT(result != 0); + + if (result > 0) { + // full match + priv->isValid = true; + priv->hasMatch = true; + priv->capturedCount = result; + priv->capturedOffsets.resize(result * 2); + } else { + // no match, partial match or error + priv->hasPartialMatch = (result == PCRE_ERROR_PARTIAL); + priv->isValid = (result == PCRE_ERROR_NOMATCH || result == PCRE_ERROR_PARTIAL); + + if (result == PCRE_ERROR_PARTIAL) { + // partial match: + // leave the start and end capture offsets (i.e. cap(0)) + priv->capturedCount = 1; + priv->capturedOffsets.resize(2); + } else { + // no match or error + priv->capturedCount = 0; + priv->capturedOffsets.clear(); + } + } + + return priv; +} + +/*! 
+ \internal +*/ +QRegularExpressionMatchPrivate::QRegularExpressionMatchPrivate(const QRegularExpression &re, + const QString &subject, + QRegularExpression::MatchType matchType, + QRegularExpression::MatchOptions matchOptions, + int capturingCount) + : regularExpression(re), subject(subject), + matchType(matchType), matchOptions(matchOptions), + capturedCount(0), + hasMatch(false), hasPartialMatch(false), isValid(false) +{ + Q_ASSERT(capturingCount >= 0); + const int captureOffsetsCount = (capturingCount + 1) * 3; + capturedOffsets.resize(captureOffsetsCount); +} + + +/*! + \internal +*/ +QRegularExpressionMatch QRegularExpressionMatchPrivate::nextMatch() const +{ + Q_ASSERT(isValid); + Q_ASSERT(hasMatch || hasPartialMatch); + + QRegularExpressionMatchPrivate *nextPrivate = regularExpression.d->doMatch(subject, + capturedOffsets.at(1), + matchType, + matchOptions, + this); + return QRegularExpressionMatch(*nextPrivate); +} + +/*! + \internal +*/ +QRegularExpressionMatchIteratorPrivate::QRegularExpressionMatchIteratorPrivate(const QRegularExpression re, + QRegularExpression::MatchType matchType, + QRegularExpression::MatchOptions matchOptions, + const QRegularExpressionMatch &next) + : next(next), + regularExpression(re), + matchType(matchType), matchOptions(matchOptions) +{ +} + +/*! + \internal +*/ +bool QRegularExpressionMatchIteratorPrivate::hasNext() const +{ + return next.isValid() && (next.hasMatch() || next.hasPartialMatch()); +} + +// PUBLIC API + +/*! + Constructs a QRegularExpression object with an empty pattern and no pattern + options. + + \sa setPattern(), setPatternOptions() +*/ +QRegularExpression::QRegularExpression() + : d(new QRegularExpressionPrivate) +{ +} + +/*! + Constructs a QRegularExpression object using the given \a pattern as + pattern and the \a options as the pattern options. 
+ + \sa setPattern(), setPatternOptions() +*/ +QRegularExpression::QRegularExpression(const QString &pattern, PatternOptions options) + : d(new QRegularExpressionPrivate) +{ + d->pattern = pattern; + d->patternOptions = options; +} + +/*! + Constructs a QRegularExpression object as a copy of \a re. + + \sa operator=() +*/ +QRegularExpression::QRegularExpression(const QRegularExpression &re) + : d(re.d) +{ +} + +/*! + Destroys the QRegularExpression object. +*/ +QRegularExpression::~QRegularExpression() +{ +} + +/*! + Assigns the regular expression \a re to this object, and returns a reference + to the copy. Both the pattern and the pattern options are copied. +*/ +QRegularExpression &QRegularExpression::operator=(const QRegularExpression &re) +{ + d = re.d; + return *this; +} + +/*! + \fn void QRegularExpression::swap(QRegularExpression &other) + + Swaps the regular expression \a other with this regular expression. This + operation is very fast and never fails. +*/ + +/*! + Returns the pattern string of the regular expression. + + \sa setPattern(), patternOptions() +*/ +QString QRegularExpression::pattern() const +{ + return d->pattern; +} + +/*! + Sets the pattern string of the regular expression to \a pattern. The + pattern options are left unchanged. + + \sa pattern(), setPatternOptions() +*/ +void QRegularExpression::setPattern(const QString &pattern) +{ + d.detach(); + d->isDirty = true; + d->pattern = pattern; +} + +/*! + Returns the pattern options for the regular expression. + + \sa setPatternOptions(), pattern() +*/ +QRegularExpression::PatternOptions QRegularExpression::patternOptions() const +{ + return d->patternOptions; +} + +/*! + Sets the given \a options as the pattern options of the regular expression. + The pattern string is left unchanged. + + \sa patternOptions(), setPattern() +*/ +void QRegularExpression::setPatternOptions(PatternOptions options) +{ + d.detach(); + d->isDirty = true; + d->patternOptions = options; +} + +/*! 
+ Returns true if the regular expression is a valid regular expression (that + is, it contains no syntax errors, etc.), or false otherwise. Use + errorString() to obtain a textual description of the error. + + \sa errorString(), patternErrorOffset() +*/ +bool QRegularExpression::isValid() const +{ + d.data()->compilePattern(); + return d->compiledPattern; +} + +/*! + Returns a textual description of the error found when checking the validity + of the regular expression, or "no error" if no error was found. + + \sa isValid(), patternErrorOffset() +*/ +QString QRegularExpression::errorString() const +{ + d.data()->compilePattern(); + if (d->errorString) + return QCoreApplication::translate("QRegularExpression", d->errorString, 0, QCoreApplication::UnicodeUTF8); + return QCoreApplication::translate("QRegularExpression", "no error", 0, QCoreApplication::UnicodeUTF8); +} + +/*! + Returns the offset, inside the pattern string, at which an error was found + when checking the validity of the regular expression. If no error was + found, then -1 is returned. + + \sa pattern(), isValid(), errorString() +*/ +int QRegularExpression::patternErrorOffset() const +{ + d.data()->compilePattern(); + return d->errorOffset; +} + +/*! + Attempts to match the regular expression against the given \a subject + string, starting at the position \a offset inside the subject, using a + match of type \a matchType and honoring the given \a matchOptions. + + The returned QRegularExpressionMatch object contains the results of the + match. + + \sa QRegularExpressionMatch, {normal matching} +*/ +QRegularExpressionMatch QRegularExpression::match(const QString &subject, + int offset, + MatchType matchType, + MatchOptions matchOptions) const +{ + d.data()->compilePattern(); + + QRegularExpressionMatchPrivate *priv = d->doMatch(subject, offset, matchType, matchOptions); + return QRegularExpressionMatch(*priv); +} + +/*! 
+ Attempts to perform a global match of the regular expression against the + given \a subject string, starting at the position \a offset inside the + subject, using a match of type \a matchType and honoring the given \a + matchOptions. + + The returned QRegularExpressionMatchIterator is positioned before the + first match result (if any). + + \sa QRegularExpressionMatchIterator, {global matching} +*/ +QRegularExpressionMatchIterator QRegularExpression::globalMatch(const QString &subject, + int offset, + MatchType matchType, + MatchOptions matchOptions) const +{ + QRegularExpressionMatchIteratorPrivate *priv = + new QRegularExpressionMatchIteratorPrivate(*this, + matchType, + matchOptions, + match(subject, offset, matchType, matchOptions)); + + return QRegularExpressionMatchIterator(*priv); +} + +/*! + Returns true if the regular expression is equal to \a re, or false + otherwise. Two QRegularExpression objects are equal if they have + the same pattern string and the same pattern options. + + \sa operator!=() +*/ +bool QRegularExpression::operator==(const QRegularExpression &re) const +{ + return (pattern() == re.pattern() && patternOptions() == re.patternOptions()); +} + +/*! + \fn bool QRegularExpression::operator!=(const QRegularExpression &re) const + + Returns true if the regular expression is different from \a re, or + false otherwise. + + \sa operator==() +*/ + +/*! + Escapes all characters of \a str so that they no longer have any special + meaning when used as a regular expression pattern string, and returns + the escaped string. 
For instance: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 26 + + This is very convenient in order to build patterns from arbitrary strings: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 27 + + \note This function implements Perl's quotemeta algorithm and escapes with + a backslash all characters in \a str, except for the characters in the + \c{[A-Z]}, \c{[a-z]} and \c{[0-9]} ranges, as well as the underscore + (\c{_}) character. The only difference with Perl is that a literal NUL + inside \a str is escaped with the sequence \c{"\\\\0"} (backslash + + \c{'0'}), instead of \c{"\\\\\\0"} (backslash + \c{NUL}). +*/ +QString QRegularExpression::escape(const QString &str) +{ + QString result; + const int count = str.size(); + result.reserve(count * 2); + + // everything but [a-zA-Z0-9_] gets escaped, + // cf. perldoc -f quotemeta + for (int i = 0; i < count; ++i) { + const QChar current = str.at(i); + + if (current == QChar::Null) { + // unlike Perl, a literal NUL must be escaped with + // "\\0" (backslash + 0) and not "\\\0" (backslash + NUL), + // because pcre16_compile uses a NUL-terminated string + result.append(QLatin1Char('\\')); + result.append(QLatin1Char('0')); + } else if ( (current < QLatin1Char('a') || current > QLatin1Char('z')) && + (current < QLatin1Char('A') || current > QLatin1Char('Z')) && + (current < QLatin1Char('0') || current > QLatin1Char('9')) && + current != QLatin1Char('_') ) + { + result.append(QLatin1Char('\\')); + result.append(current); + if (current.isHighSurrogate() && i < (count - 1)) + result.append(str.at(++i)); + } else { + result.append(current); + } + } + + result.squeeze(); + return result; +} + +/*! + Destroys the match result. +*/ +QRegularExpressionMatch::~QRegularExpressionMatch() +{ +} + +/*! + Constructs a match result by copying the result of the given \a match. 
+ + \sa operator=() +*/ +QRegularExpressionMatch::QRegularExpressionMatch(const QRegularExpressionMatch &match) + : d(match.d) +{ +} + +/*! + Assigns the match result \a match to this object, and returns a reference + to the copy. +*/ +QRegularExpressionMatch &QRegularExpressionMatch::operator=(const QRegularExpressionMatch &match) +{ + d = match.d; + return *this; +} + +/*! + \fn void QRegularExpressionMatch::swap(QRegularExpressionMatch &other) + + Swaps the match result \a other with this match result. This + operation is very fast and never fails. +*/ + +/*! + \internal +*/ +QRegularExpressionMatch::QRegularExpressionMatch(QRegularExpressionMatchPrivate &dd) + : d(&dd) +{ +} + +/*! + Returns the QRegularExpression object whose match() function returned this + object. + + \sa QRegularExpression::match(), matchType(), matchOptions() +*/ +QRegularExpression QRegularExpressionMatch::regularExpression() const +{ + return d->regularExpression; +} + + +/*! + Returns the match type that was used to get this QRegularExpressionMatch + object, that is, the match type that was passed to + QRegularExpression::match() or QRegularExpression::globalMatch(). + + \sa QRegularExpression::match(), regularExpression(), matchOptions() +*/ +QRegularExpression::MatchType QRegularExpressionMatch::matchType() const +{ + return d->matchType; +} + +/*! + Returns the match options that were used to get this + QRegularExpressionMatch object, that is, the match options that were passed + to QRegularExpression::match() or QRegularExpression::globalMatch(). + + \sa QRegularExpression::match(), regularExpression(), matchType() +*/ +QRegularExpression::MatchOptions QRegularExpressionMatch::matchOptions() const +{ + return d->matchOptions; +} + +/*! + Returns the index of the last capturing group that captured something, + including the implicit capturing group 0. 
This can be used to extract all + the substrings that were captured: + + \snippet doc/src/snippets/code/src_corelib_tools_qregularexpression.cpp 28 + + Note that some of the capturing groups with an index less than + lastCapturedIndex() could have not matched, and therefore captured nothing. + + If the regular expression did not match, this function returns -1. + + \sa captured(), capturedStart(), capturedEnd(), capturedLength() +*/ +int QRegularExpressionMatch::lastCapturedIndex() const +{ + return d->capturedCount - 1; +} + +/*! + Returns the substring captured by the \a nth capturing group. If the \a nth + capturing group did not capture a string or doesn't exist, returns a null + QString. + + \sa capturedRef(), lastCapturedIndex(), capturedStart(), capturedEnd(), + capturedLength(), QString::isNull() +*/ +QString QRegularExpressionMatch::captured(int nth) const +{ + if (nth < 0 || nth > lastCapturedIndex()) + return QString(); + + int start = capturedStart(nth); + + if (start == -1) // didn't capture + return QString(); + + return d->subject.mid(start, capturedLength(nth)); +} + +/*! + Returns a reference to the substring captured by the \a nth capturing group. + If the \a nth capturing group did not capture a string or doesn't exist, + returns a null QStringRef. + + \sa captured(), lastCapturedIndex(), capturedStart(), capturedEnd(), + capturedLength(), QStringRef::isNull() +*/ +QStringRef QRegularExpressionMatch::capturedRef(int nth) const +{ + if (nth < 0 || nth > lastCapturedIndex()) + return QStringRef(); + + int start = capturedStart(nth); + + if (start == -1) // didn't capture + return QStringRef(); + + return d->subject.midRef(start, capturedLength(nth)); +} + +/*! + Returns the substring captured by the capturing group named \a name. If the + capturing group named \a name did not capture a string or doesn't exist, + returns a null QString. 
+ + \sa capturedRef(), capturedStart(), capturedEnd(), capturedLength(), + QString::isNull() +*/ +QString QRegularExpressionMatch::captured(const QString &name) const +{ + if (name.isEmpty()) { + qWarning("QRegularExpressionMatch::captured: empty capturing group name passed"); + return QString(); + } + int nth = d->regularExpression.d->captureIndexForName(name); + if (nth == -1) + return QString(); + return captured(nth); +} + +/*! + Returns a reference to the string captured by the capturing group named \a + name. If the capturing group named \a name did not capture a string or + doesn't exist, returns a null QStringRef. + + \sa captured(), capturedStart(), capturedEnd(), capturedLength(), + QStringRef::isNull() +*/ +QStringRef QRegularExpressionMatch::capturedRef(const QString &name) const +{ + if (name.isEmpty()) { + qWarning("QRegularExpressionMatch::capturedRef: empty capturing group name passed"); + return QStringRef(); + } + int nth = d->regularExpression.d->captureIndexForName(name); + if (nth == -1) + return QStringRef(); + return capturedRef(nth); +} + +/*! + Returns a list of all strings captured by capturing groups, in the order + the groups themselves appear in the pattern string. +*/ +QStringList QRegularExpressionMatch::capturedTexts() const +{ + QStringList texts; + for (int i = 0; i <= lastCapturedIndex(); ++i) + texts << captured(i); + return texts; +} + +/*! + Returns the offset inside the subject string corresponding to the + starting position of the substring captured by the \a nth capturing group. + If the \a nth capturing group did not capture a string or doesn't exist, + returns -1. + + \sa capturedEnd(), capturedLength(), captured() +*/ +int QRegularExpressionMatch::capturedStart(int nth) const +{ + if (nth < 0 || nth > lastCapturedIndex()) + return -1; + + return d->capturedOffsets.at(nth * 2); +} + +/*! + Returns the length of the substring captured by the \a nth capturing group. 
+ + \note This function returns 0 if the \a nth capturing group did not capture + a string or doesn't exist. + + \sa capturedStart(), capturedEnd(), captured() +*/ +int QRegularExpressionMatch::capturedLength(int nth) const +{ + // bound checking performed by these two functions + return capturedEnd(nth) - capturedStart(nth); +} + +/*! + Returns the offset inside the subject string immediately after the ending + position of the substring captured by the \a nth capturing group. If the \a + nth capturing group did not capture a string or doesn't exist, returns -1. + + \sa capturedStart(), capturedLength(), captured() +*/ +int QRegularExpressionMatch::capturedEnd(int nth) const +{ + if (nth < 0 || nth > lastCapturedIndex()) + return -1; + + return d->capturedOffsets.at(nth * 2 + 1); +} + +/*! + Returns the offset inside the subject string corresponding to the starting + position of the substring captured by the capturing group named \a name. + If the capturing group named \a name did not capture a string or doesn't + exist, returns -1. + + \sa capturedEnd(), capturedLength(), captured() +*/ +int QRegularExpressionMatch::capturedStart(const QString &name) const +{ + if (name.isEmpty()) { + qWarning("QRegularExpressionMatch::capturedStart: empty capturing group name passed"); + return -1; + } + int nth = d->regularExpression.d->captureIndexForName(name); + if (nth == -1) + return -1; + return capturedStart(nth); +} + +/*! + Returns the offset inside the subject string corresponding to the starting + position of the substring captured by the capturing group named \a name. + + \note This function returns 0 if the capturing group named \a name did not + capture a string or doesn't exist. 
+ + \sa capturedStart(), capturedEnd(), captured() +*/ +int QRegularExpressionMatch::capturedLength(const QString &name) const +{ + if (name.isEmpty()) { + qWarning("QRegularExpressionMatch::capturedLength: empty capturing group name passed"); + return 0; + } + int nth = d->regularExpression.d->captureIndexForName(name); + if (nth == -1) + return 0; + return capturedLength(nth); +} + +/*! + Returns the offset inside the subject string immediately after the ending + position of the substring captured by the capturing group named \a name. If + the capturing group named \a name did not capture a string or doesn't + exist, returns -1. + + \sa capturedStart(), capturedLength(), captured() +*/ +int QRegularExpressionMatch::capturedEnd(const QString &name) const +{ + if (name.isEmpty()) { + qWarning("QRegularExpressionMatch::capturedEnd: empty capturing group name passed"); + return -1; + } + int nth = d->regularExpression.d->captureIndexForName(name); + if (nth == -1) + return -1; + return capturedEnd(nth); +} + +/*! + Returns true if the regular expression matched against the subject string, + or false otherwise. + + \sa QRegularExpression::match(), hasPartialMatch() +*/ +bool QRegularExpressionMatch::hasMatch() const +{ + return d->hasMatch; +} + +/*! + Returns true if the regular expression partially matched against the + subject string, or false otherwise. + + \note Only a match that explicitely used the one of the partial match types + can yield a partial match. Still, if such a match succeeds totally, this + function will return false, while hasMatch() will return true. + + \sa QRegularExpression::match(), QRegularExpression::MatchType, hasMatch() +*/ +bool QRegularExpressionMatch::hasPartialMatch() const +{ + return d->hasPartialMatch; +} + +/*! + Returns true if the match object was obtained as a result from the + QRegularExpression::match() function invoked on a valid QRegularExpression + object; returns false if the QRegularExpression was invalid. 
+ + \sa QRegularExpression::match(), QRegularExpression::isValid() +*/ +bool QRegularExpressionMatch::isValid() const +{ + return d->isValid; +} + +/*! + \internal +*/ +QRegularExpressionMatchIterator::QRegularExpressionMatchIterator(QRegularExpressionMatchIteratorPrivate &dd) + : d(&dd) +{ +} + +/*! + Destroys the QRegularExpressionMatchIterator object. +*/ +QRegularExpressionMatchIterator::~QRegularExpressionMatchIterator() +{ +} + +/*! + Constructs a QRegularExpressionMatchIterator object as a copy of \a + iterator. + + \sa operator=() +*/ +QRegularExpressionMatchIterator::QRegularExpressionMatchIterator(const QRegularExpressionMatchIterator &iterator) + : d(iterator.d) +{ +} + +/*! + Assigns the iterator \a iterator to this object, and returns a reference to + the copy. +*/ +QRegularExpressionMatchIterator &QRegularExpressionMatchIterator::operator=(const QRegularExpressionMatchIterator &iterator) +{ + d = iterator.d; + return *this; +} + +/*! + \fn void QRegularExpressionMatchIterator::swap(QRegularExpressionMatchIterator &other) + + Swaps the iterator \a other with this iterator object. This operation is + very fast and never fails. +*/ + +/*! + Returns true if the iterator object was obtained as a result from the + QRegularExpression::globalMatch() function invoked on a valid + QRegularExpression object; returns false if the QRegularExpression was + invalid. + + \sa QRegularExpression::globalMatch(), QRegularExpression::isValid() +*/ +bool QRegularExpressionMatchIterator::isValid() const +{ + return d->next.isValid(); +} + +/*! + Returns true if there is at least one match result ahead of the iterator; + otherwise it returns false. + + \sa next() +*/ +bool QRegularExpressionMatchIterator::hasNext() const +{ + return d->hasNext(); +} + +/*! + Returns the next match result without moving the iterator. + + \note Calling this function when the iterator is at the end of the result + set leads to undefined results. 
+*/ +QRegularExpressionMatch QRegularExpressionMatchIterator::peekNext() const +{ + if (!hasNext()) + qWarning("QRegularExpressionMatchIterator::peekNext() called on an iterator already at end"); + + return d->next; +} + +/*! + Returns the next match result and advances the iterator by one position. + + \note Calling this function when the iterator is at the end of the result + set leads to undefined results. +*/ +QRegularExpressionMatch QRegularExpressionMatchIterator::next() +{ + if (!hasNext()) { + qWarning("QRegularExpressionMatchIterator::next() called on an iterator already at end"); + return d->next; + } + + QRegularExpressionMatch current = d->next; + d->next = d->next.d.constData()->nextMatch(); + return current; +} + +/*! + Returns the QRegularExpression object whose globalMatch() function returned + this object. + + \sa QRegularExpression::globalMatch(), matchType(), matchOptions() +*/ +QRegularExpression QRegularExpressionMatchIterator::regularExpression() const +{ + return d->regularExpression; +} + +/*! + Returns the match type that was used to get this + QRegularExpressionMatchIterator object, that is, the match type that was + passed to QRegularExpression::globalMatch(). + + \sa QRegularExpression::globalMatch(), regularExpression(), matchOptions() +*/ +QRegularExpression::MatchType QRegularExpressionMatchIterator::matchType() const +{ + return d->matchType; +} + +/*! + Returns the match options that were used to get this + QRegularExpressionMatchIterator object, that is, the match options that + were passed to QRegularExpression::globalMatch(). + + \sa QRegularExpression::globalMatch(), regularExpression(), matchType() +*/ +QRegularExpression::MatchOptions QRegularExpressionMatchIterator::matchOptions() const +{ + return d->matchOptions; +} + +#ifndef QT_NO_DATASTREAM +/*! + \relates QRegularExpression + + Writes the regular expression \a re to stream \a out. 
+ + \sa {Serializing Qt Data Types} +*/ +QDataStream &operator<<(QDataStream &out, const QRegularExpression &re) +{ + out << re.pattern() << quint32(re.patternOptions()); + return out; +} + +/*! + \relates QRegularExpression + + Reads a regular expression from stream \a in into \a re. + + \sa {Serializing Qt Data Types} +*/ +QDataStream &operator>>(QDataStream &in, QRegularExpression &re) +{ + QString pattern; + quint32 patternOptions; + in >> pattern >> patternOptions; + re.setPattern(pattern); + re.setPatternOptions(QRegularExpression::PatternOptions(patternOptions)); + return in; +} +#endif + +#ifndef QT_NO_DEBUG_STREAM +/*! + \relates QRegularExpression + + Writes the regular expression \a re into the debug object \a debug for + debugging purposes. + + \sa {Debugging Techniques} +*/ +QDebug operator<<(QDebug debug, const QRegularExpression &re) +{ + debug.nospace() << "QRegularExpression(" << re.pattern() << ", " << re.patternOptions() << ")"; + return debug.space(); +} + +/*! + \relates QRegularExpressionMatch + + Writes the match object \a match into the debug object \a debug for + debugging purposes. 
+ + \sa {Debugging Techniques} +*/ +QDebug operator<<(QDebug debug, const QRegularExpressionMatch &match) +{ + debug.nospace() << "QRegularExpressionMatch("; + + if (!match.isValid()) { + debug << "Invalid)"; + return debug.space(); + } + + debug << "Valid"; + + if (match.hasMatch()) { + debug << ", has match: "; + for (int i = 0; i <= match.lastCapturedIndex(); ++i) { + debug << i + << ":(" << match.capturedStart(i) << ", " << match.capturedEnd(i) + << ", " << match.captured(i) << ")"; + if (i < match.lastCapturedIndex()) + debug << ", "; + } + } else if (match.hasPartialMatch()) { + debug << ", has partial match: (" + << match.capturedStart(0) << ", " + << match.capturedEnd(0) << ", " + << match.captured(0) << ")"; + } else { + debug << ", no match"; + } + + debug << ")"; + + return debug.space(); +} +#endif + +QT_END_NAMESPACE diff --git a/src/corelib/tools/qregularexpression.h b/src/corelib/tools/qregularexpression.h new file mode 100644 index 0000000000..c9bcb1e7ba --- /dev/null +++ b/src/corelib/tools/qregularexpression.h @@ -0,0 +1,245 @@ +/**************************************************************************** +** +** Copyright (C) 2012 Giuseppe D'Angelo . +** Contact: http://www.qt-project.org/ +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** GNU Lesser General Public License Usage +** This file may be used under the terms of the GNU Lesser General Public +** License version 2.1 as published by the Free Software Foundation and +** appearing in the file LICENSE.LGPL included in the packaging of this +** file. Please review the following information to ensure the GNU Lesser +** General Public License version 2.1 requirements will be met: +** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. 
These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU General +** Public License version 3.0 as published by the Free Software Foundation +** and appearing in the file LICENSE.GPL included in the packaging of this +** file. Please review the following information to ensure the GNU General +** Public License version 3.0 requirements will be met: +** http://www.gnu.org/copyleft/gpl.html. +** +** Other Usage +** Alternatively, this file may be used in accordance with the terms and +** conditions contained in a signed written agreement between you and Nokia. +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QREGULAREXPRESSION_H +#define QREGULAREXPRESSION_H + +#ifndef QT_NO_REGEXP + +#include +#include +#include + +QT_BEGIN_HEADER + +QT_BEGIN_NAMESPACE + +class QRegularExpressionMatch; +class QRegularExpressionMatchIterator; +struct QRegularExpressionPrivate; + +class Q_CORE_EXPORT QRegularExpression +{ +public: + enum PatternOption { + NoPatternOption = 0x0000, + CaseInsensitiveOption = 0x0001, + DotMatchesEverythingOption = 0x0002, + MultilineOption = 0x0004, + ExtendedPatternSyntaxOption = 0x0008, + InvertedGreedinessOption = 0x0010, + DontCaptureOption = 0x0020, + UseUnicodePropertiesOption = 0x0040 + }; + Q_DECLARE_FLAGS(PatternOptions, PatternOption) + + PatternOptions patternOptions() const; + void setPatternOptions(PatternOptions options); + + QRegularExpression(); + explicit QRegularExpression(const QString &pattern, PatternOptions options = NoPatternOption); + QRegularExpression(const QRegularExpression &re); + ~QRegularExpression(); + QRegularExpression &operator=(const QRegularExpression &re); + +#ifdef Q_COMPILER_RVALUE_REFS + inline QRegularExpression &operator=(QRegularExpression 
&&re) + { d.swap(re.d); return *this; } +#endif + + inline void swap(QRegularExpression &re) { d.swap(re.d); } + + QString pattern() const; + void setPattern(const QString &pattern); + + bool isValid() const; + int patternErrorOffset() const; + QString errorString() const; + + enum MatchType { + NormalMatch = 0, + PartialPreferCompleteMatch, + PartialPreferFirstMatch + }; + + enum MatchOption { + NoMatchOption = 0x0000, + AnchoredMatchOption = 0x0001 + }; + Q_DECLARE_FLAGS(MatchOptions, MatchOption) + + QRegularExpressionMatch match(const QString &subject, + int offset = 0, + MatchType matchType = NormalMatch, + MatchOptions matchOptions = NoMatchOption) const; + + QRegularExpressionMatchIterator globalMatch(const QString &subject, + int offset = 0, + MatchType matchType = NormalMatch, + MatchOptions matchOptions = NoMatchOption) const; + + static QString escape(const QString &str); + + bool operator==(const QRegularExpression &re) const; + inline bool operator!=(const QRegularExpression &re) const { return !operator==(re); } + +private: + friend struct QRegularExpressionPrivate; + friend class QRegularExpressionMatch; + friend struct QRegularExpressionMatchPrivate; + friend class QRegularExpressionMatchIterator; + + QRegularExpression(QRegularExpressionPrivate &dd); + QExplicitlySharedDataPointer d; +}; + +Q_DECLARE_OPERATORS_FOR_FLAGS(QRegularExpression::PatternOptions) +Q_DECLARE_OPERATORS_FOR_FLAGS(QRegularExpression::MatchOptions) +Q_DECLARE_TYPEINFO(QRegularExpression, Q_MOVABLE_TYPE); + +#ifndef QT_NO_DATASTREAM +Q_CORE_EXPORT QDataStream &operator<<(QDataStream &out, const QRegularExpression &re); +Q_CORE_EXPORT QDataStream &operator>>(QDataStream &in, QRegularExpression &re); +#endif + +#ifndef QT_NO_DEBUG_STREAM +Q_CORE_EXPORT QDebug operator<<(QDebug debug, const QRegularExpression &re); +#endif + +struct QRegularExpressionMatchPrivate; + +class Q_CORE_EXPORT QRegularExpressionMatch +{ +public: + ~QRegularExpressionMatch(); + 
QRegularExpressionMatch(const QRegularExpressionMatch &match); + QRegularExpressionMatch &operator=(const QRegularExpressionMatch &match); + +#ifdef Q_COMPILER_RVALUE_REFS + inline QRegularExpressionMatch &operator=(QRegularExpressionMatch &&match) + { d.swap(match.d); return *this; } +#endif + inline void swap(QRegularExpressionMatch &match) { d.swap(match.d); } + + QRegularExpression regularExpression() const; + QRegularExpression::MatchType matchType() const; + QRegularExpression::MatchOptions matchOptions() const; + + bool hasMatch() const; + bool hasPartialMatch() const; + + bool isValid() const; + + int lastCapturedIndex() const; + + QString captured(int nth = 0) const; + QStringRef capturedRef(int nth = 0) const; + + QString captured(const QString &name) const; + QStringRef capturedRef(const QString &name) const; + + QStringList capturedTexts() const; + + int capturedStart(int nth = 0) const; + int capturedLength(int nth = 0) const; + int capturedEnd(int nth = 0) const; + + int capturedStart(const QString &name) const; + int capturedLength(const QString &name) const; + int capturedEnd(const QString &name) const; + +private: + friend class QRegularExpression; + friend struct QRegularExpressionMatchPrivate; + friend class QRegularExpressionMatchIterator; + + QRegularExpressionMatch(QRegularExpressionMatchPrivate &dd); + QSharedDataPointer d; +}; + +Q_DECLARE_TYPEINFO(QRegularExpressionMatch, Q_MOVABLE_TYPE); + +#ifndef QT_NO_DEBUG_STREAM +Q_CORE_EXPORT QDebug operator<<(QDebug debug, const QRegularExpressionMatch &match); +#endif + +struct QRegularExpressionMatchIteratorPrivate; + +class Q_CORE_EXPORT QRegularExpressionMatchIterator +{ +public: + ~QRegularExpressionMatchIterator(); + QRegularExpressionMatchIterator(const QRegularExpressionMatchIterator &iterator); + QRegularExpressionMatchIterator &operator=(const QRegularExpressionMatchIterator &iterator); +#ifdef Q_COMPILER_RVALUE_REFS + inline QRegularExpressionMatchIterator 
&operator=(QRegularExpressionMatchIterator &&iterator) + { d.swap(iterator.d); return *this; } +#endif + void swap(QRegularExpressionMatchIterator &iterator) { d.swap(iterator.d); } + + bool isValid() const; + + bool hasNext() const; + QRegularExpressionMatch next(); + QRegularExpressionMatch peekNext() const; + + QRegularExpression regularExpression() const; + QRegularExpression::MatchType matchType() const; + QRegularExpression::MatchOptions matchOptions() const; + +private: + friend class QRegularExpression; + + QRegularExpressionMatchIterator(QRegularExpressionMatchIteratorPrivate &dd); + QSharedDataPointer d; +}; + +Q_DECLARE_TYPEINFO(QRegularExpressionMatchIterator, Q_MOVABLE_TYPE); + +QT_END_NAMESPACE + +Q_DECLARE_METATYPE(QRegularExpression) + +QT_END_HEADER + +#endif // QT_NO_REGEXP + +#endif // QREGULAREXPRESSION_H diff --git a/src/corelib/tools/tools.pri b/src/corelib/tools/tools.pri index 3740975b12..250789a969 100644 --- a/src/corelib/tools/tools.pri +++ b/src/corelib/tools/tools.pri @@ -30,6 +30,7 @@ HEADERS += \ tools/qqueue.h \ tools/qrect.h \ tools/qregexp.h \ + tools/qregularexpression.h \ tools/qringbuffer_p.h \ tools/qrefcount.h \ tools/qscopedpointer.h \ @@ -75,6 +76,7 @@ SOURCES += \ tools/qcontiguouscache.cpp \ tools/qrect.cpp \ tools/qregexp.cpp \ + tools/qregularexpression.cpp \ tools/qrefcount.cpp \ tools/qshareddata.cpp \ tools/qsharedpointer.cpp \ @@ -105,6 +107,12 @@ contains(QT_CONFIG,icu) { DEFINES += QT_USE_ICU } +pcre { + include($$PWD/../../3rdparty/pcre.pri) +} else { + LIBS_PRIVATE += -lpcre16 +} + DEFINES += HB_EXPORT=Q_CORE_EXPORT INCLUDEPATH += ../3rdparty/harfbuzz/src HEADERS += ../3rdparty/harfbuzz/src/harfbuzz.h -- cgit v1.2.3 From aea65cbaa4fd889129d7945600dad3277f9c6d9b Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Tue, 7 Feb 2012 23:56:40 +0000 Subject: QRegularExpression: QDebug support for pattern options Added the proper QDebug operator to debug the QRegularExpression::PatternOptions flags. 
Change-Id: Icd00e93a0c6cc4345db528d494fc176624f7b7a2 Reviewed-by: hjk Reviewed-by: Lars Knoll --- src/corelib/tools/qregularexpression.cpp | 37 ++++++++++++++++++++++++++++++++ src/corelib/tools/qregularexpression.h | 1 + 2 files changed, 38 insertions(+) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 488a454aaa..7fbbfaa9ef 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -1976,6 +1976,43 @@ QDebug operator<<(QDebug debug, const QRegularExpression &re) return debug.space(); } +/*! + \relates QRegularExpression + + Writes the pattern options \a patternOptions into the debug object \a debug + for debugging purposes. + + \sa {Debugging Techniques} +*/ +QDebug operator<<(QDebug debug, QRegularExpression::PatternOptions patternOptions) +{ + QStringList flags; + + if (patternOptions == QRegularExpression::NoPatternOption) { + flags << QLatin1String("NoPatternOption"); + } else { + if (patternOptions & QRegularExpression::CaseInsensitiveOption) + flags << QLatin1String("CaseInsensitiveOption"); + if (patternOptions & QRegularExpression::DotMatchesEverythingOption) + flags << QLatin1String("DotMatchesEverythingOption"); + if (patternOptions & QRegularExpression::MultilineOption) + flags << QLatin1String("MultilineOption"); + if (patternOptions & QRegularExpression::ExtendedPatternSyntaxOption) + flags << QLatin1String("ExtendedPatternSyntaxOption"); + if (patternOptions & QRegularExpression::InvertedGreedinessOption) + flags << QLatin1String("InvertedGreedinessOption"); + if (patternOptions & QRegularExpression::DontCaptureOption) + flags << QLatin1String("DontCaptureOption"); + if (patternOptions & QRegularExpression::UseUnicodePropertiesOption) + flags << QLatin1String("UseUnicodePropertiesOption"); + } + + debug.nospace() << "QRegularExpression::PatternOptions(" + << qPrintable(flags.join(QLatin1String("|"))) + << ")"; + + 
return debug.space(); +} /*! \relates QRegularExpressionMatch diff --git a/src/corelib/tools/qregularexpression.h b/src/corelib/tools/qregularexpression.h index c9bcb1e7ba..13c7de7cab 100644 --- a/src/corelib/tools/qregularexpression.h +++ b/src/corelib/tools/qregularexpression.h @@ -142,6 +142,7 @@ Q_CORE_EXPORT QDataStream &operator>>(QDataStream &in, QRegularExpression &re); #ifndef QT_NO_DEBUG_STREAM Q_CORE_EXPORT QDebug operator<<(QDebug debug, const QRegularExpression &re); +Q_CORE_EXPORT QDebug operator<<(QDebug debug, QRegularExpression::PatternOptions patternOptions); #endif struct QRegularExpressionMatchPrivate; -- cgit v1.2.3 From bd30234b59c1a0cef81b8ce43f2fefac1f28b318 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Wed, 8 Feb 2012 18:53:22 +0000 Subject: QRegularExpression: improve operator==, add dedicated autotest Trivial change: compare dpointers first, then the data. Added test function for operator==. Change-Id: I33ac64a59db4ccad56c30be17622187e42415f38 Reviewed-by: Lars Knoll --- src/corelib/tools/qregularexpression.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 7fbbfaa9ef..b7a5c3de8e 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -1395,7 +1395,8 @@ QRegularExpressionMatchIterator QRegularExpression::globalMatch(const QString &s */ bool QRegularExpression::operator==(const QRegularExpression &re) const { - return (pattern() == re.pattern() && patternOptions() == re.patternOptions()); + return (d == re.d) || + (d->pattern == re.d->pattern && d->patternOptions == re.d->patternOptions); } /*! 
-- cgit v1.2.3 From 1899861858eb262df7826eb32d1274233a35536e Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Wed, 8 Feb 2012 19:28:14 +0000 Subject: QRegularExpression: do not use JIT in debug builds PCRE's JIT uses self-modifying code extensively, requiring full SMC checks enabled by tools like valgrind, which slow down the execution considerably; not enabling SMC checks lead to crashes. Therefore, JIT is now disabled by default in debug builds of Qt. Its usage (both in debug and release builds) can be controlled by setting the QT_ENABLE_REGEXP_JIT environment variable. Change-Id: Ib38952400e4219582942ce65ab9edcd89c432f3e Reviewed-by: Lars Knoll --- src/corelib/tools/qregularexpression.cpp | 42 +++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index b7a5c3de8e..17988bdb31 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -509,6 +509,22 @@ QT_BEGIN_NAMESPACE QRegExp::CaretAtOffset behaviour. There is no equivalent for the other QRegExp::CaretMode modes. + \section1 Debugging code that uses QRegularExpression + + QRegularExpression internally uses a just in time compiler (JIT) to + optimize the execution of the matching algorithm. The JIT makes extensive + usage of self-modifying code, which can lead debugging tools such as + Valgrind to crash. You must enable all checks for self-modifying code if + you want to debug programs using QRegularExpression (f.i., see Valgrind's + \c{--smc-check} command line option). The downside of enabling such checks + is that your program will run considerably slower. + + To avoid that, the JIT is disabled by default if you compile Qt in debug + mode. 
It is possible to override the default and enable or disable the JIT + usage (both in debug or release mode) by setting the + \c{QT_ENABLE_REGEXP_JIT} environment variable to a non-zero or zero value + respectively. + \sa QRegularExpressionMatch, QRegularExpressionMatchIterator */ @@ -969,6 +985,25 @@ void QRegularExpressionPrivate::getPatternInfo() (patternNewlineSetting == PCRE_NEWLINE_ANYCRLF); } +/*! + \internal +*/ +static bool isJitEnabled() +{ + QByteArray jitEnvironment = qgetenv("QT_ENABLE_REGEXP_JIT"); + if (!jitEnvironment.isEmpty()) { + bool ok; + int enableJit = jitEnvironment.toInt(&ok); + return ok ? (enableJit != 0) : true; + } + +#ifdef QT_DEBUG + return false; +#else + return true; +#endif +} + /*! \internal */ @@ -981,7 +1016,12 @@ void QRegularExpressionPrivate::optimizePattern() if (studyData || (++usedCount != OPTIMIZE_AFTER_USE_COUNT)) return; - int studyOptions = PCRE_STUDY_JIT_COMPILE; + static const bool enableJit = isJitEnabled(); + + int studyOptions = 0; + if (enableJit) + studyOptions |= PCRE_STUDY_JIT_COMPILE; + const char *err; studyData = pcre16_study(compiledPattern, studyOptions, &err); -- cgit v1.2.3 From efcd4d9470781e3a0331afeab49cd00cee24399f Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Sun, 19 Feb 2012 23:56:50 +0100 Subject: QRegularExpression: add captureCount() QRegularExpression::captureCount() returns the number of capturing groups inside the regular expression pattern. 
Change-Id: Ib90ce67c67d06ab2966f0c98bd91da21defc156d Reviewed-by: Thiago Macieira --- src/corelib/tools/qregularexpression.cpp | 13 +++++++++++++ src/corelib/tools/qregularexpression.h | 2 ++ 2 files changed, 15 insertions(+) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 17988bdb31..7bbac0144b 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -1340,6 +1340,19 @@ void QRegularExpression::setPatternOptions(PatternOptions options) d->patternOptions = options; } +/*! + Returns the number of capturing groups inside the pattern string, + or -1 if the regular expression is not valid. + + \sa isValid() +*/ +int QRegularExpression::captureCount() const +{ + if (!isValid()) // will compile the pattern + return -1; + return d->capturingCount; +} + /*! Returns true if the regular expression is a valid regular expression (that is, it contains no syntax errors, etc.), or false otherwise. Use diff --git a/src/corelib/tools/qregularexpression.h b/src/corelib/tools/qregularexpression.h index 13c7de7cab..3ca83c9e27 100644 --- a/src/corelib/tools/qregularexpression.h +++ b/src/corelib/tools/qregularexpression.h @@ -94,6 +94,8 @@ public: int patternErrorOffset() const; QString errorString() const; + int captureCount() const; + enum MatchType { NormalMatch = 0, PartialPreferCompleteMatch, -- cgit v1.2.3 From d7b720dd3ee7a095abe7e283d1bed881268b0f5e Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Sun, 19 Feb 2012 23:58:04 +0100 Subject: QRegularExpression: const correctness fixes Adding some const qualifiers to members which are never written. 
Change-Id: Ibb8953764c7b7790a419a5d48f2956751d5fc1f9 Reviewed-by: Thiago Macieira --- src/corelib/tools/qregularexpression.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 7bbac0144b..40b6b5a08e 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -817,14 +817,14 @@ struct QRegularExpressionMatchPrivate : QSharedData QRegularExpressionMatch nextMatch() const; - QRegularExpression regularExpression; - QString subject; + const QRegularExpression regularExpression; + const QString subject; // the capturedOffsets vector contains pairs of (start, end) positions // for each captured substring QVector capturedOffsets; - QRegularExpression::MatchType matchType; - QRegularExpression::MatchOptions matchOptions; + const QRegularExpression::MatchType matchType; + const QRegularExpression::MatchOptions matchOptions; int capturedCount; @@ -835,16 +835,16 @@ struct QRegularExpressionMatchPrivate : QSharedData struct QRegularExpressionMatchIteratorPrivate : QSharedData { - QRegularExpressionMatchIteratorPrivate(const QRegularExpression re, + QRegularExpressionMatchIteratorPrivate(const QRegularExpression &re, QRegularExpression::MatchType matchType, QRegularExpression::MatchOptions matchOptions, const QRegularExpressionMatch &next); bool hasNext() const; QRegularExpressionMatch next; - QRegularExpression regularExpression; - QRegularExpression::MatchType matchType; - QRegularExpression::MatchOptions matchOptions; + const QRegularExpression regularExpression; + const QRegularExpression::MatchType matchType; + const QRegularExpression::MatchOptions matchOptions; }; /*! @@ -1216,7 +1216,7 @@ QRegularExpressionMatch QRegularExpressionMatchPrivate::nextMatch() const /*! 
\internal */ -QRegularExpressionMatchIteratorPrivate::QRegularExpressionMatchIteratorPrivate(const QRegularExpression re, +QRegularExpressionMatchIteratorPrivate::QRegularExpressionMatchIteratorPrivate(const QRegularExpression &re, QRegularExpression::MatchType matchType, QRegularExpression::MatchOptions matchOptions, const QRegularExpressionMatch &next) -- cgit v1.2.3 From ebb94587f69207d02269d7c20dd963e59629cfc4 Mon Sep 17 00:00:00 2001 From: "Bradley T. Hughes" Date: Fri, 2 Mar 2012 08:07:23 +0100 Subject: Use #define before including SHA-2 3rdparty code Using typedef causes errors due to re-definition, so #define the types needed by the SHA-2 code to the q[u]int* equivalents instead. Change-Id: I6fc29788dd05aeee28723820f511527d482d31f2 Reviewed-by: Oliver Wolff --- src/corelib/tools/qcryptographichash.cpp | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qcryptographichash.cpp b/src/corelib/tools/qcryptographichash.cpp index 3730a6c580..be124c94f7 100644 --- a/src/corelib/tools/qcryptographichash.cpp +++ b/src/corelib/tools/qcryptographichash.cpp @@ -48,23 +48,16 @@ #include "../../3rdparty/sha1/sha1.cpp" /* - These typedefs are needed by the RFC6234 code. Normally they would come - from from stdint.h, but since this header is not available on all platforms - (MSVC 2008, for example), we need to define them ourselves. + These #defines replace the typedefs needed by the RFC6234 code. Normally + the typedefs would come from stdint.h, but since this header is not + available on all platforms (MSVC 2008, for example), we #define them to the + Qt equivalents.
*/ -#ifndef _UINT64_T_DECLARED -typedef QT_PREPEND_NAMESPACE(quint64) uint64_t; -#endif +#define uint64_t QT_PREPEND_NAMESPACE(quint64) +#define uint32_t QT_PREPEND_NAMESPACE(quint32) +#define uint8_t QT_PREPEND_NAMESPACE(quint8) +#define int_least16_t QT_PREPEND_NAMESPACE(qint16) -#ifndef _UINT32_T_DECLARED -typedef QT_PREPEND_NAMESPACE(quint32) uint32_t; -#endif - -#ifndef _UINT8_T_DECLARED -typedef QT_PREPEND_NAMESPACE(quint8) uint8_t; -#endif - -typedef QT_PREPEND_NAMESPACE(qint16) int_least16_t; // Header from rfc6234 with 1 modification: // sha1.h - commented out '#include ' on line 74 #include "../../3rdparty/rfc6234/sha.h" @@ -90,16 +83,21 @@ static int SHA384_512AddLength(SHA512Context *context, unsigned int length); // sha384-512.c - appended 'M' to the SHA224_256AddLength macro on line 304 #include "../../3rdparty/rfc6234/sha384-512.c" +#undef uint64_t +#undef uint32_t +#undef uint8_t +#undef int_least16_t + #include static inline int SHA224_256AddLength(SHA256Context *context, unsigned int length) { - uint32_t addTemp; + QT_PREPEND_NAMESPACE(quint32) addTemp; return SHA224_256AddLengthM(context, length); } static inline int SHA384_512AddLength(SHA512Context *context, unsigned int length) { - uint64_t addTemp; + QT_PREPEND_NAMESPACE(quint64) addTemp; return SHA384_512AddLengthM(context, length); } -- cgit v1.2.3 From c74bc26605f2337b13f366c2600fff4822d88ffe Mon Sep 17 00:00:00 2001 From: Aaron McCarthy Date: Tue, 6 Mar 2012 11:17:07 +1000 Subject: Support legacy QDataStream serialization of QDate. Commit 8327fa7c11f6c84ccc66be4365ee282a76288788 changed the type of the Julian day member of QDate from quint32 to qint64. This changed the QDataStream format. Keep the old behavior, with the limited date range, if the stream version is less than Qt_5_0.
Change-Id: I800448979a1891581069f39de7f9ab9c634e4f0e Reviewed-by: John Layt Reviewed-by: Thiago Macieira --- src/corelib/tools/qdatetime.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qdatetime.cpp b/src/corelib/tools/qdatetime.cpp index 64ad3121d0..fa5eed4f86 100644 --- a/src/corelib/tools/qdatetime.cpp +++ b/src/corelib/tools/qdatetime.cpp @@ -3511,7 +3511,10 @@ void QDateTime::detach() QDataStream &operator<<(QDataStream &out, const QDate &date) { - return out << (qint64)(date.jd); + if (out.version() < QDataStream::Qt_5_0) + return out << quint32(date.jd); + else + return out << qint64(date.jd); } /*! @@ -3524,9 +3527,16 @@ QDataStream &operator<<(QDataStream &out, const QDate &date) QDataStream &operator>>(QDataStream &in, QDate &date) { - qint64 jd; - in >> jd; - date.jd = jd; + if (in.version() < QDataStream::Qt_5_0) { + quint32 jd; + in >> jd; + date.jd = jd; + } else { + qint64 jd; + in >> jd; + date.jd = jd; + } + return in; } -- cgit v1.2.3 From 7ae6a6e744f92db2626b7c9b38175dc4140a4eaf Mon Sep 17 00:00:00 2001 From: Rohan McGovern Date: Tue, 6 Mar 2012 17:55:14 +1000 Subject: Fixed warning from gcc with -Wundef for some values of WCHAR_MAX Certain versions of system headers will declare WCHAR_MAX like: #define __WCHAR_MAX ( (wchar_t) - 1 ) #define WCHAR_MAX __WCHAR_MAX In particular on ARM (see e.g. http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=598937 ) In this case, defined(WCHAR_MAX) is true, but attempting to use the value of WCHAR_MAX in a preprocessor expression will not give the desired results - "wchar_t" is unknown to the preprocessor, so WCHAR_MAX silently (without -Wundef) evaluates to ( (0) - 1 ) == -1. A simple workaround is to avoid looking at WCHAR_MAX when the superior __SIZEOF_WCHAR_T__ is defined. Change-Id: I439b166cffb93416737ee19025fb6e8d51c27876 Reviewed-by: Bradley T. 
Hughes Reviewed-by: Thiago Macieira --- src/corelib/tools/qstring.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qstring.h b/src/corelib/tools/qstring.h index 6fc86fc04b..4d02fbe66d 100644 --- a/src/corelib/tools/qstring.h +++ b/src/corelib/tools/qstring.h @@ -98,7 +98,9 @@ template struct QConstStringData #define QT_UNICODE_LITERAL_II(str) u"" str -#elif defined(Q_OS_WIN) || (defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2) || defined(WCHAR_MAX) && (WCHAR_MAX - 0 < 65536) +#elif defined(Q_OS_WIN) \ + || (defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2) \ + || (!defined(__SIZEOF_WCHAR_T__) && defined(WCHAR_MAX) && (WCHAR_MAX - 0 < 65536)) // wchar_t is 2 bytes template struct QConstStringData { -- cgit v1.2.3 From 02d947524d887e3ff6cb24065ccdbf3311ea81a8 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Sat, 3 Mar 2012 12:41:13 +0000 Subject: QRegularExpression: fix documentation due to qdoc changes Removes the usage of various qdoc macros which are now deprecated. Change-Id: I74fa70f8d2a2a1bff57cdb2bcc14a31a7198dea0 Reviewed-by: Casper van Donderen --- src/corelib/tools/qregularexpression.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 40b6b5a08e..7faa907e35 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -73,21 +73,21 @@ QT_BEGIN_NAMESPACE strings and texts. This is useful in many contexts, e.g., \table - \row \i Validation - \i A regexp can test whether a substring meets some criteria, + \row \li Validation + \li A regexp can test whether a substring meets some criteria, e.g. is an integer or contains no whitespace. 
- \row \i Searching - \i A regexp provides more powerful pattern matching than + \row \li Searching + \li A regexp provides more powerful pattern matching than simple substring matching, e.g., match one of the words \e{mail}, \e{letter} or \e{correspondence}, but none of the words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. - \row \i Search and Replace - \i A regexp can replace all occurrences of a substring with a + \row \li Search and Replace + \li A regexp can replace all occurrences of a substring with a different substring, e.g., replace all occurrences of \e{&} with \e{\&} except where the \e{&} is already followed by an \e{amp;}. - \row \i String Splitting - \i A regexp can be used to identify where a string should be + \row \li String Splitting + \li A regexp can be used to identify where a string should be split apart, e.g. splitting tab-delimited strings. \endtable @@ -99,12 +99,12 @@ QT_BEGIN_NAMESPACE Good references about regular expressions include: \list - \o \e {Mastering Regular Expressions} (Third Edition) by Jeffrey E. F. + \li \e {Mastering Regular Expressions} (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4; - \o the \l{http://pcre.org/pcre.txt} {pcrepattern(3)} man page, describing + \li the \l{http://pcre.org/pcre.txt} {pcrepattern(3)} man page, describing the pattern syntax supported by PCRE (the reference implementation of Perl-compatible regular expressions); - \o the \l{http://perldoc.perl.org/perlre.html} {Perl's regular expression + \li the \l{http://perldoc.perl.org/perlre.html} {Perl's regular expression documentation} and the \l{http://perldoc.perl.org/perlretut.html} {Perl's regular expression tutorial}. \endlist @@ -117,7 +117,7 @@ QT_BEGIN_NAMESPACE supports Unicode. For an overview of the regular expression syntax supported by QRegularExpression, please refer to the aforementioned pcrepattern(3) man page. 
A regular expression is made up of two things: a - \bold{pattern string} and a set of \bold{pattern options} that change the + \b{pattern string} and a set of \b{pattern options} that change the meaning of the pattern string. You can set the pattern string by passing a string to the QRegularExpression @@ -307,9 +307,9 @@ QT_BEGIN_NAMESPACE to do so we must distinguish three cases: \list - \o the input cannot possibly match the regular expression; - \o the input does match the regular expression; - \o the input does not match the regular expression right now, + \li the input cannot possibly match the regular expression; + \li the input does match the regular expression; + \li the input does not match the regular expression right now, but it will if more charaters will be added to it. \endlist @@ -653,7 +653,7 @@ QT_BEGIN_NAMESPACE \value ExtendedPatternSyntaxOption Any whitespace in the pattern string which is not escaped and outside a - character class is ignored. Moreover, an unescaped sharp (\bold{#}) + character class is ignored. Moreover, an unescaped sharp (\b{#}) outside a character class causes all the following characters, until the first newline (included), to be ignored. This can be used to increase the readability of a pattern string as well as put comments -- cgit v1.2.3 From a47d974e19512ae5a445f64fbd2b1703201f781d Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Thu, 16 Feb 2012 23:49:27 +0100 Subject: QRegularExpression: fix optimizePattern, document the issue The studyData pointer is atomically set by the pointer assignment, but another processor running a different thread might see the new studyData value but not the memory it points to. Therefore, the current studyData is returned from optimizePattern and used by that thread. Docs were added to optimizePattern to explain what's going on. 
Change-Id: I4502c336077bb98a1751011aa93ffd4f585ed101 Reviewed-by: Thiago Macieira --- src/corelib/tools/qregularexpression.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 7faa907e35..0252a30c89 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -773,7 +773,7 @@ struct QRegularExpressionPrivate : QSharedData void cleanCompiledPattern(); void compilePattern(); void getPatternInfo(); - void optimizePattern(); + pcre16_extra *optimizePattern(); QRegularExpressionMatchPrivate *doMatch(const QString &subject, int offset, @@ -1006,15 +1006,30 @@ static bool isJitEnabled() /*! \internal + + The purpose of the function is to call pcre16_study (which allows some + optimizations to be performed, including JIT-compiling the pattern), and + setting the studyData member variable to the result of the study. It gets + called by doMatch() every time a match is performed. As of now, the + optimizations on the pattern are performed after a certain number of usages + (i.e. the OPTIMIZE_AFTER_USE_COUNT constant). + + Notice that although the method is protected by a mutex, one thread may + invoke this function and return immediately (i.e. not study the pattern, + leaving studyData to NULL); but before calling pcre16_exec to perform the + match, another thread performs the studying and sets studyData to something + else. Although the assignment to studyData is itself atomic, the release of + the memory pointed by studyData isn't. Therefore, the current studyData + value is returned and used by doMatch. 
*/ -void QRegularExpressionPrivate::optimizePattern() +pcre16_extra *QRegularExpressionPrivate::optimizePattern() { Q_ASSERT(compiledPattern); QMutexLocker lock(&mutex); if (studyData || (++usedCount != OPTIMIZE_AFTER_USE_COUNT)) - return; + return studyData; static const bool enableJit = isJitEnabled(); @@ -1027,6 +1042,8 @@ void QRegularExpressionPrivate::optimizePattern() if (!studyData && err) qWarning("QRegularExpressionPrivate::optimizePattern(): pcre_study failed: %s", err); + + return studyData; } /*! @@ -1089,7 +1106,7 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString capturingCount); // this is mutex protected - const_cast(this)->optimizePattern(); + const pcre16_extra *currentStudyData = const_cast(this)->optimizePattern(); int pcreOptions = convertToPcreOptions(matchOptions); @@ -1113,12 +1130,12 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString int result; if (!previousMatchWasEmpty) { - result = pcre16_exec(compiledPattern, studyData, + result = pcre16_exec(compiledPattern, currentStudyData, subjectUtf16, subjectLength, offset, pcreOptions, captureOffsets, captureOffsetsCount); } else { - result = pcre16_exec(compiledPattern, studyData, + result = pcre16_exec(compiledPattern, currentStudyData, subjectUtf16, subjectLength, offset, pcreOptions | PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED, captureOffsets, captureOffsetsCount); @@ -1136,7 +1153,7 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString ++offset; } - result = pcre16_exec(compiledPattern, studyData, + result = pcre16_exec(compiledPattern, currentStudyData, subjectUtf16, subjectLength, offset, pcreOptions, captureOffsets, captureOffsetsCount); -- cgit v1.2.3 From eb709333989a7e3d8eb662a0e167ac81ca19d882 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Wed, 22 Feb 2012 03:33:00 +0000 Subject: QRegularExpression: add optimizations autotest Exporting the counter that controls the optimization of a 
compiled pattern lets us forcibly optimize all patterns. Therefore, two tests are now run: one with default optimization values and another one which always optimizes the pattern. The counter itself was renamed with a qt_ prefix and put inside the Qt compilation namespace (thanks to rohanpm for pointing it out). Change-Id: I56602433d37adc127772b2d0d2cdaf2e49d43c71 Reviewed-by: Rohan McGovern Reviewed-by: Thiago Macieira --- src/corelib/tools/qregularexpression.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 0252a30c89..0fa7d6459e 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -50,9 +50,6 @@ #include -// after how many usages we optimize the regexp -static const unsigned int OPTIMIZE_AFTER_USE_COUNT = 10; - QT_BEGIN_NAMESPACE /*! @@ -726,6 +723,13 @@ QT_BEGIN_NAMESPACE contain any metacharacter that anchors the match at that point. */ +// after how many usages we optimize the regexp +#ifdef QT_BUILD_INTERNAL +Q_AUTOTEST_EXPORT unsigned int qt_qregularexpression_optimize_after_use_count = 10; +#else +static const unsigned int qt_qregularexpression_optimize_after_use_count = 10; +#endif // QT_BUILD_INTERNAL + /*! \internal */ @@ -1012,7 +1016,7 @@ static bool isJitEnabled() setting the studyData member variable to the result of the study. It gets called by doMatch() every time a match is performed. As of now, the optimizations on the pattern are performed after a certain number of usages - (i.e. the OPTIMIZE_AFTER_USE_COUNT constant). + (i.e. the qt_qregularexpression_optimize_after_use_count constant). Notice that although the method is protected by a mutex, one thread may invoke this function and return immediately (i.e. 
not study the pattern, @@ -1028,7 +1032,7 @@ pcre16_extra *QRegularExpressionPrivate::optimizePattern() QMutexLocker lock(&mutex); - if (studyData || (++usedCount != OPTIMIZE_AFTER_USE_COUNT)) + if (studyData || (++usedCount != qt_qregularexpression_optimize_after_use_count)) return studyData; static const bool enableJit = isJitEnabled(); -- cgit v1.2.3