diff options
author | Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io> | 2021-08-04 14:35:02 +0200 |
---|---|---|
committer | Qt Cherry-pick Bot <cherrypick_bot@qt-project.org> | 2021-08-10 13:21:28 +0000 |
commit | 0da1e0171218af8430e9e96e7f919f3b3c459869 (patch) | |
tree | dbf0996fe786720c822923b1db61d6784787ba5f | |
parent | c6154452011da135c61bc00f86905c767a16f42d (diff) |
QUrl: Fix Punycode handling for non-BMP codepoints
Iterate over Unicode codepoints instead of UTF-16 characters
when converting to/from Punycode as described in the specification.
Additionally, reject strings with invalid surrogate pairs when
encoding to Punycode, and reject strings with any encoded surrogates
when decoding.
Remove the expected-failure marking from the test for this issue
in tst_qurlinternal.
Fixes: QTBUG-95577
Change-Id: I3dd68f95ada6d652e2fa5c0c3118dcfa0a5f4c4d
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
(cherry picked from commit 9bd2ab85ac88e88f29a11a7dedc8635261ca3484)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
-rw-r--r-- | src/corelib/io/qurlidna.cpp | 53 | ||||
-rw-r--r-- | tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp | 1 |
2 files changed, 39 insertions, 15 deletions
diff --git a/src/corelib/io/qurlidna.cpp b/src/corelib/io/qurlidna.cpp index 774903c35b..c1a5220dbb 100644 --- a/src/corelib/io/qurlidna.cpp +++ b/src/corelib/io/qurlidna.cpp @@ -2252,15 +2252,27 @@ Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output) if (h > 0) *output += QLatin1Char{'-'}; + // compute the input length in Unicode code points. + qsizetype inputLength = 0; + for (QStringIterator iter(in); iter.hasNext();) { + inputLength++; + + if (iter.next(char32_t(-1)) == char32_t(-1)) { + output->truncate(outLen); + return; // invalid surrogate pair + } + } + // while there are still unprocessed non-basic code points left in // the input string... - while (h < (uint) in.length()) { + while (h < inputLength) { // find the character in the input string with the lowest // unicode value. uint m = Q_MAXINT; - for (QChar c : in) { - if (c.unicode() >= n && c.unicode() < m) - m = (uint) c.unicode(); + for (QStringIterator iter(in); iter.hasNext();) { + auto c = iter.nextUnchecked(); + if (c >= n && c < m) + m = c; } // reject out-of-bounds unicode characters @@ -2272,11 +2284,12 @@ Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output) delta += (m - n) * (h + 1); n = m; - for (QChar c : in) { + for (QStringIterator iter(in); iter.hasNext();) { + auto c = iter.nextUnchecked(); // increase delta until we reach the character with the // lowest unicode code. fail if delta overflows. - if (c.unicode() < n) { + if (c < n) { ++delta; if (!delta) { output->truncate(outLen); @@ -2286,7 +2299,7 @@ Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output) // if j is the index of the character with the lowest // unicode code... - if (c.unicode() == n) { + if (c == n) { appendEncode(output, delta, bias, b, h); } } @@ -2314,8 +2327,8 @@ Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc) // find the last delimiter character '-' in the input array. 
copy // all data before this delimiter directly to the output array. int delimiterPos = pc.lastIndexOf(QLatin1Char{'-'}); - QString output = delimiterPos < 4 ? - QString() : pc.mid(start, delimiterPos - start); + auto output = delimiterPos < 4 ? std::u32string() + : pc.mid(start, delimiterPos - start).toStdU32String(); // if a delimiter was found, skip to the position after it; // otherwise start at the front of the input string. everything @@ -2357,18 +2370,30 @@ Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc) // find new bias and calculate the next non-basic code // character. - bias = adapt(i - oldi, output.length() + 1, oldi == 0); - n += i / (output.length() + 1); + uint outputLength = static_cast<uint>(output.length()); + bias = adapt(i - oldi, outputLength + 1, oldi == 0); + n += i / (outputLength + 1); // allow the deltas to wrap around - i %= (output.length() + 1); + i %= (outputLength + 1); + + // Surrogates should normally be rejected later by other IDNA code. + // But because of Qt's use of UTF-16 to represent strings the + // IDNA code is not able to distinguish characters represented as pairs + // of surrogates from normal code points. This is why surrogates are + // not allowed here. + // + // Allowing surrogates would lead to non-unique (after normalization) + // encoding of strings with non-BMP characters. 
+ if (QChar::isSurrogate(n)) + return QString(); // insert the character n at position i - output.insert((uint) i, QChar((ushort) n)); + output.insert(i, 1, static_cast<char32_t>(n)); ++i; } - return output; + return QString::fromStdU32String(output); } static const char * const idn_whitelist[] = { diff --git a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp index f9876fc12e..7f2c506a1a 100644 --- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp +++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp @@ -246,7 +246,6 @@ void tst_QUrlInternal::idna_testsuite() QString result; qt_punycodeEncoder(QStringView{unicode.points, numchars}, &result); - QEXPECT_FAIL("U+102F7", "QTBUG-95577: Non-BMP handling is broken", Continue); QCOMPARE(result.toLatin1(), punycode); QCOMPARE(qt_punycodeDecoder(result), QString::fromUtf16(unicode.points, numchars)); } |