Use the new UTF-8 codec in QUrl and QUrlQuery

The new code is based on what QUrl already had, so this should have no net effect on performance.

Change-Id: Ibb2fabd5a108e99a44e0e6e3f713ce2f8b26e4d7
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2013-10-19 23:39:01 -0400
committer: The Qt Project <gerrit-noreply@qt-project.org> 2014-01-09 22:34:54 +0100
commit: 2b82923c8fba5dcff707e344acdf9db8c444a55e (patch)
tree: 4f53cb17a881e380665bf4aa95a324fa909c4532 /src/corelib/io/qurlrecode.cpp
parent: cd750c86d62152855d7f1105a5a4dd86d8ff8797 (diff)
1 files changed, 66 insertions, 120 deletions
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp
index 80fc0319fe..74a981b654 100644
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@@ -40,6 +40,7 @@
 ****************************************************************************/
 
 #include "qurl.h"
+#include "private/qutfcodec_p.h"
 
 QT_BEGIN_NAMESPACE
 
@@ -232,110 +233,73 @@ static void ensureDetached(QString &result, ushort *&output, const ushort *begin
     }
 }
 
-// returns true if we performed an UTF-8 decoding
-static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
-                               const ushort *end, ushort decoded)
+namespace {
+struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii
 {
-    int charsNeeded;
-    uint min_uc;
-    uint uc;
-
-    if (decoded <= 0xC1) {
-        // an UTF-8 first character must be at least 0xC0
-        // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
-        return false;
-    } else if (decoded < 0xe0) {
-        charsNeeded = 2;
-        min_uc = 0x80;
-        uc = decoded & 0x1f;
-    } else if (decoded < 0xf0) {
-        charsNeeded = 3;
-        min_uc = 0x800;
-        uc = decoded & 0x0f;
-    } else if (decoded < 0xf5) {
-        charsNeeded = 4;
-        min_uc = 0x10000;
-        uc = decoded & 0x07;
-    } else {
-        // the last Unicode character is U+10FFFF
-        // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
-        // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
-        return false;
+    // override: our "bytes" are three percent-encoded UTF-16 characters
+    static void appendByte(ushort *&ptr, uchar b)
+    {
+        // b >= 0x80, by construction, so percent-encode
+        *ptr++ = '%';
+        *ptr++ = encodeNibble(b >> 4);
+        *ptr++ = encodeNibble(b & 0xf);
     }
 
-    // are there enough remaining?
-    if (end - input < 3*charsNeeded)
-        return false;
+    static uchar peekByte(const ushort *ptr, int n = 0)
+    {
+        // decodePercentEncoding returns ushort(-1) if it can't decode,
+        // which means we return 0xff, which is not a valid continuation byte.
+        // If ptr[n * 3] is not '%', we'll multiply by zero and return 0,
+        // also not a valid continuation byte (if it's '%', we multiply by 1).
+        return uchar(decodePercentEncoding(ptr + n * 3))
+                * uchar(ptr[n * 3] == '%');
+    }
 
-    if (input[3] != '%')
-        return false;
+    static qptrdiff availableBytes(const ushort *ptr, const ushort *end)
+    {
+        return (end - ptr) / 3;
+    }
 
-    // first continuation character
-    decoded = decodePercentEncoding(input + 3);
-    if ((decoded & 0xc0) != 0x80)
-        return false;
-    uc <<= 6;
-    uc |= decoded & 0x3f;
-
-    if (charsNeeded > 2) {
-        if (input[6] != '%')
-            return false;
-
-        // second continuation character
-        decoded = decodePercentEncoding(input + 6);
-        if ((decoded & 0xc0) != 0x80)
-            return false;
-        uc <<= 6;
-        uc |= decoded & 0x3f;
-
-        if (charsNeeded > 3) {
-            if (input[9] != '%')
-                return false;
-
-            // third continuation character
-            decoded = decodePercentEncoding(input + 9);
-            if ((decoded & 0xc0) != 0x80)
-                return false;
-            uc <<= 6;
-            uc |= decoded & 0x3f;
-        }
+    static void advanceByte(const ushort *&ptr, int n = 1)
+    {
+        ptr += n * 3;
     }
+};
+}
 
-    // we've decoded something; safety-check it
-    if (uc < min_uc)
-        return false;
-    if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
+// returns true if we performed an UTF-8 decoding
+static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
+                               const ushort *end, ushort decoded)
+{
+    uint ucs4, *dst = &ucs4;
+    const ushort *src = input + 3;// skip the %XX that yielded \a decoded
+    int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(decoded, dst, src, end);
+    if (charsNeeded < 0)
         return false;
 
-    if (!QChar::requiresSurrogates(uc)) {
+    if (!QChar::requiresSurrogates(ucs4)) {
         // UTF-8 decoded and no surrogates are required
         // detach if necessary
-        ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 1);
-        *output++ = uc;
+        // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char
+        ensureDetached(result, output, begin, input, end, -3 * charsNeeded + 1);
+        *output++ = ucs4;
     } else {
         // UTF-8 decoded to something that requires a surrogate pair
-        ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 2);
-        *output++ = QChar::highSurrogate(uc);
-        *output++ = QChar::lowSurrogate(uc);
+        // compressing from %XX%XX%XX%XX (12 chars) to two
+        ensureDetached(result, output, begin, input, end, -10);
+        *output++ = QChar::highSurrogate(ucs4);
+        *output++ = QChar::lowSurrogate(ucs4);
     }
-    input += charsNeeded * 3 - 1;
+
+    input = src - 1;
     return true;
 }
 
 static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin,
                                  const ushort *&input, const ushort *end, ushort decoded)
 {
-    uint uc = decoded;
-    if (QChar::isHighSurrogate(uc)) {
-        if (input < end && QChar::isLowSurrogate(input[1]))
-            uc = QChar::surrogateToUcs4(uc, input[1]);
-    }
-
-    // note: we will encode bad UTF-16 to UTF-8
-    // but they don't get decoded back
-
-    // calculate the utf8 length
-    int utf8len = uc >= 0x10000 ? 4 : uc >= 0x800 ? 3 : 2;
+    // calculate the utf8 length and ensure enough space is available
+    int utf8len = QChar::isHighSurrogate(decoded) ? 4 : decoded >= 0x800 ? 3 : 2;
 
     // detach
     if (!output) {
@@ -357,50 +321,32 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
         }
     }
 
-    // write the sequence
-    if (uc < 0x800) {
-        // first of two bytes
-        uchar c = 0xc0 | uchar(uc >> 6);
+    ++input;
+    int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(decoded, output, input, end);
+    --input;
+    if (res < 0) {
+        // bad surrogate pair sequence
+        // we will encode bad UTF-16 to UTF-8
+        // but they don't get decoded back
+
+        // first of three bytes
+        uchar c = 0xe0 | uchar(decoded >> 12);
         *output++ = '%';
-        *output++ = encodeNibble(c >> 4);
+        *output++ = 'E';
         *output++ = encodeNibble(c & 0xf);
-    } else {
-        uchar c;
-        if (uc > 0xFFFF) {
-            // first two of four bytes
-            c = 0xf0 | uchar(uc >> 18);
-            *output++ = '%';
-            *output++ = 'F';
-            *output++ = encodeNibble(c & 0xf);
 
-            // continuation byte
-            c = 0x80 | (uchar(uc >> 12) & 0x3f);
-            *output++ = '%';
-            *output++ = encodeNibble(c >> 4);
-            *output++ = encodeNibble(c & 0xf);
-
-            // this was a surrogate pair
-            ++input;
-        } else {
-            // first of three bytes
-            c = 0xe0 | uchar(uc >> 12);
-            *output++ = '%';
-            *output++ = 'E';
-            *output++ = encodeNibble(c & 0xf);
-        }
+        // second byte
+        c = 0x80 | (uchar(decoded >> 6) & 0x3f);
+        *output++ = '%';
+        *output++ = encodeNibble(c >> 4);
+        *output++ = encodeNibble(c & 0xf);
 
-        // continuation byte
-        c = 0x80 | (uchar(uc >> 6) & 0x3f);
+        // third byte
+        c = 0x80 | (decoded & 0x3f);
         *output++ = '%';
         *output++ = encodeNibble(c >> 4);
         *output++ = encodeNibble(c & 0xf);
     }
-
-    // continuation byte
-    uchar c = 0x80 | (uc & 0x3f);
-    *output++ = '%';
-    *output++ = encodeNibble(c >> 4);
-    *output++ = encodeNibble(c & 0xf);
 }
 
 static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding,
author	Thiago Macieira <thiago.macieira@intel.com>	2013-10-19 23:39:01 -0400
committer	The Qt Project <gerrit-noreply@qt-project.org>	2014-01-09 22:34:54 +0100
commit	2b82923c8fba5dcff707e344acdf9db8c444a55e (patch)
tree	4f53cb17a881e380665bf4aa95a324fa909c4532 /src/corelib/io/qurlrecode.cpp
parent	cd750c86d62152855d7f1105a5a4dd86d8ff8797 (diff)