Remove the tolerant parsing function and make the recoder tolerant

The reason for this change is that the strict parser made little sense to exist. What would the recoder do if it was passed an invalid string? I believe that the tolerant recoder is more efficient than the correcting code followed by the strict recoder. This makes the recoder more complex and probably a little less efficient, but it's better in the common case (tolerant that doesn't need fixes) and in the worst case (needs fixes). Change-Id: I68a0c9fda6765de05914cbd6ba7d3cea560a7cd6 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2011-09-06 20:36:28 +0200
committer: Qt by Nokia <qt-info@nokia.com> 2012-03-30 01:19:59 +0200
commit: 73e16b15a6b3dc9838763407665d5797ba5618b2 (patch)
tree: 9d89100a2d65e99d27d1a815175e9ae5ef58fcde /src/corelib/io/qurlrecode.cpp
parent: 6028efa3ff56b58ce70d5b8fdb53030185149028 (diff)
1 files changed, 95 insertions, 106 deletions
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp
index 6a0517a7e5..27f541915d 100644
--- a/src/corelib/io/qurlrecode.cpp
+++ b/src/corelib/io/qurlrecode.cpp
@@ -133,6 +133,18 @@ static inline ushort decodeNibble(ushort c)
            c >= 'A' ? c - 'A' + 0xA : c - '0';
 }
 
+// if the sequence at input is 2*HEXDIG, returns its decoding
+// returns -1 if it isn't.
+// assumes that the range has been checked already
+static inline ushort decodePercentEncoding(const ushort *input)
+{
+    ushort c1 = input[0];
+    ushort c2 = input[1];
+    if (!isHex(c1) || !isHex(c2))
+        return ushort(-1);
+    return decodeNibble(c1) << 4 | decodeNibble(c2);
+}
+
 static inline ushort encodeNibble(ushort c)
 {
     static const uchar hexnumbers[] = "0123456789ABCDEF";
@@ -170,16 +182,15 @@ static inline bool isUnicodeNonCharacter(uint ucs4)
 // returns true if we performed an UTF-8 decoding
 static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
 {
+    int charsNeeded;
+    uint min_uc;
+    uint uc;
+
     if (decoded <= 0xC1) {
         // an UTF-8 first character must be at least 0xC0
         // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
         return false;
-    }
-
-    int charsNeeded;
-    uint min_uc;
-    uint uc;
-    if (decoded < 0xe0) {
+    } else if (decoded < 0xe0) {
         charsNeeded = 1;
         min_uc = 0x80;
         uc = decoded & 0x1f;
@@ -194,7 +205,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
     } else {
         // the last Unicode character is U+10FFFF
         // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
-        // therefore, a byte outside the range 0xC0..0xF4 is not the UTF-8 first byte
+        // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
         return false;
     }
 
@@ -206,7 +217,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
         return false;
 
     // first continuation character
-    decoded = (decodeNibble(input[3]) << 4) | decodeNibble(input[4]);
+    decoded = decodePercentEncoding(input + 3);
     if ((decoded & 0xc0) != 0x80)
         return false;
     uc <<= 6;
@@ -217,7 +228,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
             return false;
 
         // second continuation character
-        decoded = (decodeNibble(input[6]) << 4) | decodeNibble(input[7]);
+        decoded = decodePercentEncoding(input + 6);
         if ((decoded & 0xc0) != 0x80)
             return false;
         uc <<= 6;
@@ -228,7 +239,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i
                 return false;
 
             // third continuation character
-            decoded = (decodeNibble(input[9]) << 4) | decodeNibble(input[10]);
+            decoded = decodePercentEncoding(input + 9);
             if ((decoded & 0xc0) != 0x80)
                 return false;
             uc <<= 6;
@@ -348,72 +359,82 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort
     *output++ = encodeNibble(c & 0xf);
 }
 
-Q_AUTOTEST_EXPORT QString
-qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
-             const uchar *tableModifications)
+static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding,
+                      const uchar *actionTable, bool retryBadEncoding)
 {
-    uchar actionTable[sizeof defaultActionTable];
-    memcpy(actionTable, defaultActionTable, sizeof actionTable);
-    if (encoding & QUrl::DecodeSpaces)
-        actionTable[0] = DecodeCharacter; // decode
-
-    if (tableModifications) {
-        for (const ushort *p = tableModifications; *p; ++p)
-            actionTable[uchar(*p) - ' '] = *p >> 8;
-    }
-
     QString result = component;
     const ushort *input = reinterpret_cast<const ushort *>(component.constData());
     const ushort * const end = input + component.length();
     ushort *output = 0;
 
     while (input != end) {
-        register ushort c = *input++;
-        register ushort decoded;
-        if (c == '%') {
-            // our input is always valid, so there are two hex characters for us to read here
-            decoded = (decodeNibble(input[0]) << 4) | decodeNibble(input[1]);
-        } else {
-            decoded = c;
+        register ushort c;
+        EncodingAction action;
+
+        // try a run where no change is necessary
+        while (input != end) {
+            c = *input++;
+            if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U)
+                goto non_trivial;
+            action = EncodingAction(actionTable[c - ' ']);
+            if (action == EncodeCharacter)
+                goto non_trivial;
+            if (output)
+                *output++ = c;
         }
+        break;
 
-        EncodingAction action;
-        if (decoded < 0x20) {
-            // always encode control characters
-            action = EncodeCharacter;
-        } else if (decoded < 0x80) {
-            // use the table
-            action = EncodingAction(actionTable[decoded - ' ']);
-        } else {
-            // non-ASCII
-            bool decodeUnicode = encoding & QUrl::DecodeUnicode;
+non_trivial:
+        register uint decoded;
+        if (c == '%' && retryBadEncoding) {
+            // always write "%25"
+            ensureDetached(result, output, input, end);
+            *output++ = '%';
+            *output++ = '2';
+            *output++ = '5';
+            continue;
+        } else if (c == '%') {
+            // check if the input is valid
+            if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
+                // not valid, retry
+                result.clear();
+                return recode(component, encoding, actionTable, true);
+            }
 
-            // should we leave it like this?
-            if ((c != '%' && decodeUnicode) || (c == '%' && !decodeUnicode)) {
-                action = LeaveCharacter;
-            } else if (decodeUnicode) {
-                // c == '%': decode the UTF-8 sequence
-                if (encodedUtf8ToUcs4(result, output, input, end, decoded))
+            if (decoded >= 0x80) {
+                // decode the UTF-8 sequence
+                if (encoding & QUrl::DecodeUnicode &&
+                        encodedUtf8ToUcs4(result, output, input, end, decoded))
                     continue;
+
+                // decoding the encoded UTF-8 failed
                 action = LeaveCharacter;
-            } else {
-                // c != '%': encode the UTF-8 sequence
+            } else if (decoded >= 0x20) {
+                action = EncodingAction(actionTable[decoded - ' ']);
+            }
+        } else {
+            decoded = c;
+            if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) {
+                // encode the UTF-8 sequence
                 unicodeToEncodedUtf8(result, output, input, end, decoded);
                 continue;
+            } else if (decoded >= 0x80) {
+                if (output)
+                    *output++ = c;
+                continue;
             }
         }
 
+        if (decoded < 0x20)
+            action = EncodeCharacter;
+
         // there are six possibilities:
         //  current \ action  | DecodeCharacter | LeaveCharacter | EncodeCharacter
         //      decoded       |    1:leave      |    2:leave     |    3:encode
         //      encoded       |    4:decode     |    5:leave     |    6:leave
+        // cases 1 and 2 were handled before this section
 
-        if (c != '%' && (action == LeaveCharacter || action == DecodeCharacter)) {
-            // cases 1 and 2: it's decoded and we're leaving it as is
-            // there's always enough memory allocated for a single character
-            if (output)
-                *output++ = c;
-        } else if (c == '%' && (action == LeaveCharacter || action == EncodeCharacter)) {
+        if (c == '%' && action != DecodeCharacter) {
             // cases 5 and 6: it's encoded and we're leaving it as it is
             // except we're pedantic and we'll uppercase the hex
             if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) {
@@ -442,63 +463,31 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding
 }
 
 Q_AUTOTEST_EXPORT QString
-qt_tolerantParsePercentEncoding(const QString &url)
+qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
+             const ushort *tableModifications)
 {
-    // are there any '%'
-    int firstPercent = url.indexOf(QLatin1Char('%'));
-    if (firstPercent == -1) {
-        // none found, the string is fine
-        return url;
+    uchar actionTable[sizeof defaultActionTable];
+    if (encoding & QUrl::DecodeAllDelimiters) {
+        // reset the table
+        memset(actionTable, DecodeCharacter, sizeof actionTable);
+        if (!(encoding & QUrl::DecodeSpaces))
+            actionTable[0] = EncodeCharacter;
+
+        // these are always encoded
+        actionTable['%' - ' '] = EncodeCharacter;
+        actionTable[0x7F - ' '] = EncodeCharacter;
+    } else {
+        memcpy(actionTable, defaultActionTable, sizeof actionTable);
+        if (encoding & QUrl::DecodeSpaces)
+            actionTable[0] = DecodeCharacter; // decode
     }
 
-    // are there any invalid percents?
-    int nextPercent = firstPercent;
-    int percentCount = 0;
-
-    {
-        int len = url.length();
-        bool ok = true;
-        do {
-            ++percentCount;
-            if (nextPercent + 2 >= len ||
-                    !isHex(url.at(nextPercent + 1).unicode()) ||
-                    !isHex(url.at(nextPercent + 2).unicode())) {
-                ok = false;
-            }
-
-            nextPercent = url.indexOf(QLatin1Char('%'), nextPercent + 1);
-        } while (nextPercent != -1);
-
-        if (ok)
-            return url;
+    if (tableModifications) {
+        for (const ushort *p = tableModifications; *p; ++p)
+            actionTable[uchar(*p) - ' '] = *p >> 8;
     }
 
-    // we've found at least one invalid percent
-    // that means all of them are invalid
-    QString corrected(url.size() + percentCount * 2, Qt::Uninitialized);
-    ushort *output = reinterpret_cast<ushort *>(corrected.data());
-    const ushort *input = reinterpret_cast<const ushort *>(url.constData());
-    for (int i = 0; i <= firstPercent; ++i)
-        output[i] = input[i];
-
-    const ushort *const end = input + url.length();
-    output += firstPercent + 1;
-    input += firstPercent + 1;
-
-    // we've copied up to the first percent
-    // correct this one and all others
-    *output++ = '2';
-    *output++ = '5';
-    while (input != end) {
-        // copy verbatim until the next percent, inclusive
-        *output++ = *input;
-        if (*input == '%') {
-            *output++ = '2';
-            *output++ = '5';
-        }
-        ++input;
-    }
-    return corrected;
+    return recode(component, encoding, actionTable, false);
 }
 
 QT_END_NAMESPACE
author	Thiago Macieira <thiago.macieira@intel.com>	2011-09-06 20:36:28 +0200
committer	Qt by Nokia <qt-info@nokia.com>	2012-03-30 01:19:59 +0200
commit	73e16b15a6b3dc9838763407665d5797ba5618b2 (patch)
tree	9d89100a2d65e99d27d1a815175e9ae5ef58fcde /src/corelib/io/qurlrecode.cpp
parent	6028efa3ff56b58ce70d5b8fdb53030185149028 (diff)