diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2023-01-24 13:18:06 -0800 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2023-01-31 19:38:01 -0800 |
commit | b22ae069ac193cfa0479d0bc258a860ef00816b4 (patch) | |
tree | c742dd2bed141438ad7fba9efe1a15055a0ae8dd | |
parent | ee515dd842d79fa4543568ed82bd7c949923e438 (diff) |
QRegularExpression: fix count() when the RE matches a surrogate
When the match finds a surrogate pair as the first true Unicode character,
then we need to skip both code units of the pair in order to restart the
search. PCRE2 does not allow us to search for individual UTF-16 code
units.
That actually means that counting "." gives us the count of Unicode
characters.
Fixes: QTBUG-110586
Pick-to: 5.15 6.2 6.4 6.5
Change-Id: I194d0a32c94148f398e6fffd173d5b5be8137e19
Reviewed-by: Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
Reviewed-by: Marc Mutz <marc.mutz@qt.io>
-rw-r--r-- | src/corelib/text/qstring.cpp | 8 | ||||
-rw-r--r-- | tests/auto/corelib/text/qstring/tst_qstring.cpp | 15 |
2 files changed, 22 insertions, 1 deletion
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index d33af45912..12c3186a71 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -9707,8 +9707,14 @@ qsizetype QtPrivate::count(QStringView haystack, const QRegularExpression &re)
         QRegularExpressionMatch match = re.matchView(haystack, index + 1);
         if (!match.hasMatch())
             break;
-        index = match.capturedStart();
         count++;
+
+        // Search again, from the next character after the beginning of this
+        // capture. If the capture starts with a surrogate pair, both together
+        // count as "one character".
+        index = match.capturedStart();
+        if (index < len && haystack[index].isHighSurrogate())
+            ++index;
     }
     return count;
 }
diff --git a/tests/auto/corelib/text/qstring/tst_qstring.cpp b/tests/auto/corelib/text/qstring/tst_qstring.cpp
index 2ea3e2e6e9..fbbb4377f4 100644
--- a/tests/auto/corelib/text/qstring/tst_qstring.cpp
+++ b/tests/auto/corelib/text/qstring/tst_qstring.cpp
@@ -2059,6 +2059,21 @@ void tst_QString::count()
     QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern);
     QCOMPARE(emptyStr.count(QRegularExpression("invalid regex\\")), 0);
 #endif
+
+    QString nonBmpString = u8"\U00010000\U00010000abc\U00010000";
+    QCOMPARE(nonBmpString.count(u"\U00010000"), 3);
+#if QT_CONFIG(regularexpression)
+    QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000")), 3);
+    QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000a?")), 3);
+    QCOMPARE(nonBmpString.count(QRegularExpression(u8"\U00010000a")), 1);
+    QCOMPARE(nonBmpString.count(QRegularExpression(".")), 6);
+
+    // can't search for unpaired surrogates
+    QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern);
+    QCOMPARE(nonBmpString.count(QRegularExpression(QChar(0xd800))), 0);
+    QTest::ignoreMessage(QtWarningMsg, ignoreMessagePattern);
+    QCOMPARE(nonBmpString.count(QRegularExpression(QChar(0xdc00))), 0);
+#endif
 }
 
 void tst_QString::contains()