summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGiuseppe D'Angelo <dangelog@gmail.com>2012-04-07 01:08:58 +0100
committerQt by Nokia <qt-info@nokia.com>2012-04-12 15:12:23 +0200
commit5c9ba41f71340dd194cee5f9f5864a66b0940c9d (patch)
treec0c7909187e975009aed01170c2c584e9e55a03f
parent56ff31f0c17ff7c038028c1af6afb6eab5216bd6 (diff)
QRegularExpression: optimize global match
PCRE doesn't like illegal Unicode sequences (it's explicitely documented in pcreunicode(3) that they trigger undefined behaviour, and the program may crash). Therefore, we always let PCRE check the validity of both the pattern and the subject string. However, when performing global matching, the subject string can be checked only once: subsequent matches can safely skip the check and avoid a huge performance hit of scanning the whole subject string for each match (!). This patch implements that behaviour internally -- it's still not possible for the user to skip the sanity check. On large subject strings, this gives a terrific performance benefit. Change-Id: Ia44cf18782e07966c9cd6ec4ccfef081ed131763 Reviewed-by: Robin Burchell <robin+qt@viroteck.net> Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
-rw-r--r--src/corelib/tools/qregularexpression.cpp13
1 files changed, 12 insertions, 1 deletions
diff --git a/src/corelib/tools/qregularexpression.cpp b/src/corelib/tools/qregularexpression.cpp
index 27264f7e72..a65ac0cd2b 100644
--- a/src/corelib/tools/qregularexpression.cpp
+++ b/src/corelib/tools/qregularexpression.cpp
@@ -796,6 +796,7 @@ struct QRegularExpressionPrivate : QSharedData
int offset,
QRegularExpression::MatchType matchType,
QRegularExpression::MatchOptions matchOptions,
+ bool checkSubjectString = true,
const QRegularExpressionMatchPrivate *previous = 0) const;
int captureIndexForName(const QString &name) const;
@@ -1165,7 +1166,8 @@ static int pcre16SafeExec(const pcre16 *code, const pcre16_extra *extra,
Performs a match of type \a matchType on the given \a subject string with
options \a matchOptions and returns the QRegularExpressionMatchPrivate of
the result. It also advances a match if a previous result is given as \a
- previous.
+ previous. The \a subject string goes a Unicode validity check if
+ \a checkSubjectString is true (PCRE doesn't like illegal UTF-16 sequences).
Advancing a match is a tricky algorithm. If the previous match matched a
non-empty string, we just do an ordinary match at the offset position.
@@ -1182,6 +1184,7 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString
int offset,
QRegularExpression::MatchType matchType,
QRegularExpression::MatchOptions matchOptions,
+ bool checkSubjectString,
const QRegularExpressionMatchPrivate *previous) const
{
if (offset < 0)
@@ -1211,6 +1214,9 @@ QRegularExpressionMatchPrivate *QRegularExpressionPrivate::doMatch(const QString
else if (matchType == QRegularExpression::PartialPreferFirstMatch)
pcreOptions |= PCRE_PARTIAL_HARD;
+ if (!checkSubjectString)
+ pcreOptions |= PCRE_NO_UTF16_CHECK;
+
bool previousMatchWasEmpty = false;
if (previous && previous->hasMatch &&
(previous->capturedOffsets.at(0) == previous->capturedOffsets.at(1))) {
@@ -1318,10 +1324,15 @@ QRegularExpressionMatch QRegularExpressionMatchPrivate::nextMatch() const
Q_ASSERT(isValid);
Q_ASSERT(hasMatch || hasPartialMatch);
+ // Note the "false" passed for the check of the subject string:
+ // if we're advancing a match on the same subject,
+ // then that subject was already checked at least once (when this object
+ // was created, or when the object that created this one was created, etc.)
QRegularExpressionMatchPrivate *nextPrivate = regularExpression.d->doMatch(subject,
capturedOffsets.at(1),
matchType,
matchOptions,
+ false,
this);
return QRegularExpressionMatch(*nextPrivate);
}