diff options
author | Giuseppe D'Angelo <dangelog@gmail.com> | 2012-04-07 01:08:58 +0100 |
---|---|---|
committer | Qt by Nokia <qt-info@nokia.com> | 2012-04-12 15:12:23 +0200 |
commit | 5c9ba41f71340dd194cee5f9f5864a66b0940c9d (patch) | |
tree | c0c7909187e975009aed01170c2c584e9e55a03f | |
parent | 56ff31f0c17ff7c038028c1af6afb6eab5216bd6 (diff) |
QRegularExpression: optimize global match
PCRE doesn't like illegal Unicode sequences (it's explicitly
documented in pcreunicode(3) that they trigger undefined behaviour,
and the program may crash). Therefore, we always let PCRE check
the validity of both the pattern and the subject string.
However, when performing global matching, the subject string
needs to be checked only once: subsequent matches can safely skip the check
and avoid a huge performance hit of scanning the whole subject
string for each match (!).
This patch implements that behaviour internally -- it's still
not possible for the user to skip the sanity check. On large
subject strings, this gives a terrific performance benefit.
Change-Id: Ia44cf18782e07966c9cd6ec4ccfef081ed131763
Reviewed-by: Robin Burchell <robin+qt@viroteck.net>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
-rw-r--r-- | src/corelib/tools/qregularexpression.cpp | 13 |
1 files changed, 12 insertions, 1 deletions
diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp index 27264f7e72..a65ac0cd2b 100644 --- a/src/corelib/tools/qregularexpression.cpp +++ b/src/corelib/tools/qregularexpression.cpp @@ -796,6 +796,7 @@ struct QRegularExpressionPrivate : QSharedData int offset, QRegularExpression::MatchType matchType, QRegularExpression::MatchOptions matchOptions, + bool checkSubjectString = true, const QRegularExpressionMatchPrivate *previous = 0) const; int captureIndexForName(const QString &name) const; @@ -1165,7 +1166,8 @@ static int pcre16SafeExec(const pcre16 *code, const pcre16_extra *extra, Performs a match of type \a matchType on the given \a subject string with options \a matchOptions and returns the QRegularExpressionMatchPrivate of the result. It also advances a match if a previous result is given as \a - previous. + previous. The \a subject string undergoes a Unicode validity check if + \a checkSubjectString is true (PCRE doesn't like illegal UTF-16 sequences). Advancing a match is a tricky algorithm. If the previous match matched a non-empty string, we just do an ordinary match at the offset position. 
@@ -1182,6 +1184,7 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString int offset, QRegularExpression::MatchType matchType, QRegularExpression::MatchOptions matchOptions, + bool checkSubjectString, const QRegularExpressionMatchPrivate *previous) const { if (offset < 0) @@ -1211,6 +1214,9 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString else if (matchType == QRegularExpression::PartialPreferFirstMatch) pcreOptions |= PCRE_PARTIAL_HARD; + if (!checkSubjectString) + pcreOptions |= PCRE_NO_UTF16_CHECK; + bool previousMatchWasEmpty = false; if (previous && previous->hasMatch && (previous->capturedOffsets.at(0) == previous->capturedOffsets.at(1))) { @@ -1318,10 +1324,15 @@ QRegularExpressionMatch QRegularExpressionMatchPrivate::nextMatch() const Q_ASSERT(isValid); Q_ASSERT(hasMatch || hasPartialMatch); + // Note the "false" passed for the check of the subject string: + // if we're advancing a match on the same subject, + // then that subject was already checked at least once (when this object + // was created, or when the object that created this one was created, etc.) QRegularExpressionMatchPrivate *nextPrivate = regularExpression.d->doMatch(subject, capturedOffsets.at(1), matchType, matchOptions, + false, this); return QRegularExpressionMatch(*nextPrivate); } |