QUrl: Fix Punycode handling for non-BMP codepoints

Iterate over Unicode codepoints instead of UTF-16 code units when converting to/from Punycode, as described in the specification. Additionally, reject strings with invalid surrogate pairs when encoding to Punycode, and reject strings with any encoded surrogates when decoding. Remove the expected-failure marking from the test for this issue in tst_qurlinternal. Fixes: QTBUG-95577 Change-Id: I3dd68f95ada6d652e2fa5c0c3118dcfa0a5f4c4d Reviewed-by: Thiago Macieira <thiago.macieira@intel.com> Reviewed-by: Edward Welbourne <edward.welbourne@qt.io> (cherry picked from commit 9bd2ab85ac88e88f29a11a7dedc8635261ca3484) Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
author: Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io> 2021-08-04 14:35:02 +0200
committer: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org> 2021-08-10 13:21:28 +0000
commit: 0da1e0171218af8430e9e96e7f919f3b3c459869 (patch)
tree: dbf0996fe786720c822923b1db61d6784787ba5f
parent: c6154452011da135c61bc00f86905c767a16f42d (diff)
2 files changed, 39 insertions, 15 deletions
diff --git a/src/corelib/io/qurlidna.cpp b/src/corelib/io/qurlidna.cpp
index 774903c35b..c1a5220dbb 100644
--- a/src/corelib/io/qurlidna.cpp
+++ b/src/corelib/io/qurlidna.cpp
@@ -2252,15 +2252,27 @@ Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
     if (h > 0)
         *output += QLatin1Char{'-'};
 
+    // compute the input length in Unicode code points.
+    qsizetype inputLength = 0;
+    for (QStringIterator iter(in); iter.hasNext();) {
+        inputLength++;
+
+        if (iter.next(char32_t(-1)) == char32_t(-1)) {
+            output->truncate(outLen);
+            return; // invalid surrogate pair
+        }
+    }
+
     // while there are still unprocessed non-basic code points left in
     // the input string...
-    while (h < (uint) in.length()) {
+    while (h < inputLength) {
         // find the character in the input string with the lowest
         // unicode value.
         uint m = Q_MAXINT;
-        for (QChar c : in) {
-            if (c.unicode() >= n && c.unicode() < m)
-                m = (uint) c.unicode();
+        for (QStringIterator iter(in); iter.hasNext();) {
+            auto c = iter.nextUnchecked();
+            if (c >= n && c < m)
+                m = c;
         }
 
         // reject out-of-bounds unicode characters
@@ -2272,11 +2284,12 @@ Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
         delta += (m - n) * (h + 1);
         n = m;
 
-        for (QChar c : in) {
+        for (QStringIterator iter(in); iter.hasNext();) {
+            auto c = iter.nextUnchecked();
 
             // increase delta until we reach the character with the
             // lowest unicode code. fail if delta overflows.
-            if (c.unicode() < n) {
+            if (c < n) {
                 ++delta;
                 if (!delta) {
                     output->truncate(outLen);
@@ -2286,7 +2299,7 @@ Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
 
             // if j is the index of the character with the lowest
             // unicode code...
-            if (c.unicode() == n) {
+            if (c == n) {
                 appendEncode(output, delta, bias, b, h);
             }
         }
@@ -2314,8 +2327,8 @@ Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
     // find the last delimiter character '-' in the input array. copy
     // all data before this delimiter directly to the output array.
     int delimiterPos = pc.lastIndexOf(QLatin1Char{'-'});
-    QString output = delimiterPos < 4 ?
-                     QString() : pc.mid(start, delimiterPos - start);
+    auto output = delimiterPos < 4 ? std::u32string()
+                                   : pc.mid(start, delimiterPos - start).toStdU32String();
 
     // if a delimiter was found, skip to the position after it;
     // otherwise start at the front of the input string. everything
@@ -2357,18 +2370,30 @@ Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
 
         // find new bias and calculate the next non-basic code
         // character.
-        bias = adapt(i - oldi, output.length() + 1, oldi == 0);
-        n += i / (output.length() + 1);
+        uint outputLength = static_cast<uint>(output.length());
+        bias = adapt(i - oldi, outputLength + 1, oldi == 0);
+        n += i / (outputLength + 1);
 
         // allow the deltas to wrap around
-        i %= (output.length() + 1);
+        i %= (outputLength + 1);
+
+        // Surrogates should normally be rejected later by other IDNA code.
+        // But because of Qt's use of UTF-16 to represent strings the
+        // IDNA code is not able to distinguish characters represented as pairs
+        // of surrogates from normal code points. This is why surrogates are
+        // not allowed here.
+        //
+        // Allowing surrogates would lead to non-unique (after normalization)
+        // encoding of strings with non-BMP characters.
+        if (QChar::isSurrogate(n))
+            return QString();
 
         // insert the character n at position i
-        output.insert((uint) i, QChar((ushort) n));
+        output.insert(i, 1, static_cast<char32_t>(n));
         ++i;
     }
 
-    return output;
+    return QString::fromStdU32String(output);
 }
 
 static const char * const idn_whitelist[] = {
diff --git a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
index f9876fc12e..7f2c506a1a 100644
--- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
+++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
@@ -246,7 +246,6 @@ void tst_QUrlInternal::idna_testsuite()
 
     QString result;
     qt_punycodeEncoder(QStringView{unicode.points, numchars}, &result);
-    QEXPECT_FAIL("U+102F7", "QTBUG-95577: Non-BMP handling is broken", Continue);
     QCOMPARE(result.toLatin1(), punycode);
     QCOMPARE(qt_punycodeDecoder(result), QString::fromUtf16(unicode.points, numchars));
 }
author	Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>	2021-08-04 14:35:02 +0200
committer	Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>	2021-08-10 13:21:28 +0000
commit	0da1e0171218af8430e9e96e7f919f3b3c459869 (patch)
tree	dbf0996fe786720c822923b1db61d6784787ba5f
parent	c6154452011da135c61bc00f86905c767a16f42d (diff)