Add the code that recodes URLs.

This one function is an all-in-one: - UTF-8 encoder - UTF-8 decoder - percent encoder - percent decoder The next step is add the ability to modify the behaviour, by telling the function what else it must encode or decode and what it should leave untouched. Change-Id: I997eccfd2f9ad8487305670b18d6c806f4cf6717 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2011-09-05 23:17:21 +0200
committer: Qt by Nokia <qt-info@nokia.com> 2012-03-30 01:19:59 +0200
commit: 6028efa3ff56b58ce70d5b8fdb53030185149028 (patch)
tree: d809a79bf11ed92778da5dc81fe45bbe2165ace6
parent: 4c7e950aad0ed7b2bc114b3ffd5c73f7a433af52 (diff)
4 files changed, 733 insertions, 0 deletions
diff --git a/src/corelib/io/io.pri b/src/corelib/io/io.pri
index a3bc3afbcf..a70b3c9012 100644
--- a/src/corelib/io/io.pri
+++ b/src/corelib/io/io.pri
@@ -65,6 +65,7 @@ SOURCES += \
         io/qresource_iterator.cpp \
         io/qstandardpaths.cpp \
         io/qurl.cpp \
+        io/qurlrecode.cpp \
         io/qsettings.cpp \
         io/qfsfileengine.cpp \
         io/qfsfileengine_iterator.cpp \
diff --git a/src/corelib/io/qurl.h b/src/corelib/io/qurl.h
index 7c6cc29618..1600cb8fa3 100644
--- a/src/corelib/io/qurl.h
+++ b/src/corelib/io/qurl.h
@@ -82,6 +82,18 @@ public:
     };
     Q_DECLARE_FLAGS(FormattingOptions, FormattingOption)
 
+    enum ComponentFormattingOption {
+        FullyEncoded = 0x000000,
+        DecodeSpaces = 0x100000,
+        DecodeUnambiguousDelimiters = 0x200000,
+        DecodeAllDelimiters = DecodeUnambiguousDelimiters | 0x400000,
+        DecodeUnicode = 0x800000,
+
+        PrettyDecoded = DecodeSpaces | DecodeUnambiguousDelimiters | DecodeUnicode,
+        MostDecoded = PrettyDecoded | DecodeAllDelimiters
+    };
+    Q_DECLARE_FLAGS(ComponentFormattingOptions, ComponentFormattingOption)
+
     QUrl();
 #ifdef QT_NO_URL_CAST_FROM_STRING
     explicit
@@ -236,6 +248,7 @@ inline uint qHash(const QUrl &url)
 
 Q_DECLARE_TYPEINFO(QUrl, Q_MOVABLE_TYPE);
 Q_DECLARE_SHARED(QUrl)
+Q_DECLARE_OPERATORS_FOR_FLAGS(QUrl::ComponentFormattingOptions)
 Q_DECLARE_OPERATORS_FOR_FLAGS(QUrl::FormattingOptions)
 
 #ifndef QT_NO_DATASTREAM
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp
new file mode 100644
index 0000000000..6a0517a7e5
--- /dev/null
+++ b/src/corelib/io/qurlrecode.cpp
@@ -0,0 +1,504 @@
+/****************************************************************************
+**
+** Copyright (C) 2012 Intel Corporation
+** Contact: http://www.qt-project.org/
+**
+** This file is part of the QtCore module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** GNU Lesser General Public License Usage
+** This file may be used under the terms of the GNU Lesser General Public
+** License version 2.1 as published by the Free Software Foundation and
+** appearing in the file LICENSE.LGPL included in the packaging of this
+** file. Please review the following information to ensure the GNU Lesser
+** General Public License version 2.1 requirements will be met:
+** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights. These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU General
+** Public License version 3.0 as published by the Free Software Foundation
+** and appearing in the file LICENSE.GPL included in the packaging of this
+** file. Please review the following information to ensure the GNU General
+** Public License version 3.0 requirements will be met:
+** http://www.gnu.org/copyleft/gpl.html.
+**
+** Other Usage
+** Alternatively, this file may be used in accordance with the terms and
+** conditions contained in a signed written agreement between you and Nokia.
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qurl.h"
+
+QT_BEGIN_NAMESPACE
+
+// ### move to qurl_p.h
+enum EncodingAction {
+    DecodeCharacter = 0,
+    LeaveCharacter = 1,
+    EncodeCharacter = 2
+};
+
+// From RFC 3896, Appendix A Collected ABNF for URI
+//    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+//    reserved      = gen-delims / sub-delims
+//    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+//    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+//                  / "*" / "+" / "," / ";" / "="
+static const uchar defaultActionTable[96] = {
+    2, // space
+    1, // '!' (sub-delim)
+    2, // '"'
+    1, // '#' (gen-delim)
+    1, // '$' (gen-delim)
+    2, // '%' (percent)
+    1, // '&' (gen-delim)
+    1, // "'" (sub-delim)
+    1, // '(' (sub-delim)
+    1, // ')' (sub-delim)
+    1, // '*' (sub-delim)
+    1, // '+' (sub-delim)
+    1, // ',' (sub-delim)
+    0, // '-' (unreserved)
+    0, // '.' (unreserved)
+    1, // '/' (gen-delim)
+
+    0, 0, 0, 0, 0,  // '0' to '4' (unreserved)
+    0, 0, 0, 0, 0,  // '5' to '9' (unreserved)
+    1, // ':' (gen-delim)
+    1, // ';' (sub-delim)
+    2, // '<'
+    1, // '=' (sub-delim)
+    2, // '>'
+    1, // '?' (gen-delim)
+
+    1, // '@' (gen-delim)
+    0, 0, 0, 0, 0,  // 'A' to 'E' (unreserved)
+    0, 0, 0, 0, 0,  // 'F' to 'J' (unreserved)
+    0, 0, 0, 0, 0,  // 'K' to 'O' (unreserved)
+    0, 0, 0, 0, 0,  // 'P' to 'T' (unreserved)
+    0, 0, 0, 0, 0, 0,  // 'U' to 'Z' (unreserved)
+    1, // '[' (gen-delim)
+    2, // '\'
+    1, // ']' (gen-delim)
+    2, // '^'
+    0, // '_' (unreserved)
+
+    2, // '`'
+    0, 0, 0, 0, 0,  // 'a' to 'e' (unreserved)
+    0, 0, 0, 0, 0,  // 'f' to 'j' (unreserved)
+    0, 0, 0, 0, 0,  // 'k' to 'o' (unreserved)
+    0, 0, 0, 0, 0,  // 'p' to 't' (unreserved)
+    0, 0, 0, 0, 0, 0,  // 'u' to 'z' (unreserved)
+    2, // '{'
+    2, // '|'
+    2, // '}'
+    0, // '~' (unreserved)
+
+    2  // BSKP
+};
+
+static inline bool isHex(ushort c)
+{
+    return (c >= 'a' && c <= 'f') ||
+            (c >= 'A' && c <= 'F') ||
+            (c >= '0' && c <= '9');
+}
+
+static inline bool isUpperHex(ushort c)
+{
+    // undefined behaviour if c isn't an hex char!
+    return c < 0x60;
+}
+
+static inline ushort toUpperHex(ushort c)
+{
+    return isUpperHex(c) ? c : c - 0x20;
+}
+
+static inline ushort decodeNibble(ushort c)
+{
+    return c >= 'a' ? c - 'a' + 0xA :
+           c >= 'A' ? c - 'A' + 0xA : c - '0';
+}
+
+static inline ushort encodeNibble(ushort c)
+{
+    static const uchar hexnumbers[] = "0123456789ABCDEF";
+    return hexnumbers[c & 0xf];
+}
+
+static void ensureDetached(QString &result, ushort *&output, const ushort *input, const ushort *end)
+{
+    if (!output) {
+        // now detach
+        // create enough space if the rest of the string needed to be percent-encoded
+        int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
+        int charsRemaining = end - input + 1;
+        int newSize = result.size() + 2 * charsRemaining;
+        result.resize(newSize);
+
+        // set the output variable
+        output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
+    }
+}
+
+static inline bool isUnicodeNonCharacter(uint ucs4)
+{
+    // Unicode has a couple of "non-characters" that one can use internally,
+    // but are not allowed to be used for text interchange.
+    //
+    // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
+    // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
+    // U+FDEF (inclusive)
+
+    return (ucs4 & 0xfffe) == 0xfffe
+            || (ucs4 - 0xfdd0U) < 16;
+}
+
+// returns true if we performed an UTF-8 decoding
+static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
+{
+    if (decoded <= 0xC1) {
+        // an UTF-8 first character must be at least 0xC0
+        // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
+        return false;
+    }
+
+    int charsNeeded;
+    uint min_uc;
+    uint uc;
+    if (decoded < 0xe0) {
+        charsNeeded = 1;
+        min_uc = 0x80;
+        uc = decoded & 0x1f;
+    } else if (decoded < 0xf0) {
+        charsNeeded = 2;
+        min_uc = 0x800;
+        uc = decoded & 0x0f;
+    } else if (decoded < 0xf5) {
+        charsNeeded = 3;
+        min_uc = 0x10000;
+        uc = decoded & 0x07;
+    } else {
+        // the last Unicode character is U+10FFFF
+        // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
+        // therefore, a byte outside the range 0xC0..0xF4 is not the UTF-8 first byte
+        return false;
+    }
+
+    // are there enough remaining?
+    if (end - input < 3*charsNeeded + 2)
+        return false;
+
+    if (input[2] != '%')
+        return false;
+
+    // first continuation character
+    decoded = (decodeNibble(input[3]) << 4) | decodeNibble(input[4]);
+    if ((decoded & 0xc0) != 0x80)
+        return false;
+    uc <<= 6;
+    uc |= decoded & 0x3f;
+
+    if (charsNeeded > 1) {
+        if (input[5] != '%')
+            return false;
+
+        // second continuation character
+        decoded = (decodeNibble(input[6]) << 4) | decodeNibble(input[7]);
+        if ((decoded & 0xc0) != 0x80)
+            return false;
+        uc <<= 6;
+        uc |= decoded & 0x3f;
+
+        if (charsNeeded > 2) {
+            if (input[8] != '%')
+                return false;
+
+            // third continuation character
+            decoded = (decodeNibble(input[9]) << 4) | decodeNibble(input[10]);
+            if ((decoded & 0xc0) != 0x80)
+                return false;
+            uc <<= 6;
+            uc |= decoded & 0x3f;
+        }
+    }
+
+    // we've decoded something; safety-check it
+    if (uc < min_uc)
+        return false;
+    if (isUnicodeNonCharacter(uc) || (uc >= 0xD800 && uc <= 0xDFFF) || uc >= 0x110000)
+        return false;
+
+    // detach if necessary
+    if (!output) {
+        // create enough space if the rest of the string needed to be percent-encoded
+        int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
+        int charsRemaining = end - input - 2 - 3*charsNeeded;
+        int newSize = result.size() + 2 * charsRemaining;
+        result.resize(newSize);
+
+        // set the output variable
+        output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
+    }
+
+    if (!QChar::requiresSurrogates(uc)) {
+        // UTF-8 decoded and no surrogates are required
+        *output++ = uc;
+    } else {
+        // UTF-8 decoded to something that requires a surrogate pair
+        *output++ = QChar::highSurrogate(uc);
+        *output++ = QChar::lowSurrogate(uc);
+    }
+    input += charsNeeded * 3 + 2;
+    return true;
+}
+
+static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded)
+{
+    uint uc = decoded;
+    if (QChar::isHighSurrogate(uc)) {
+        if (QChar::isLowSurrogate(*input))
+            uc = QChar::surrogateToUcs4(uc, *input);
+    }
+
+    // note: we will encode bad UTF-16 to UTF-8
+    // but they don't get decoded back
+
+    // calculate the utf8 length
+    int utf8len = uc >= 0x10000 ? 4 : uc >= 0x800 ? 3 : 2;
+
+    // detach
+    if (!output) {
+        // create enough space if the rest of the string needed to be percent-encoded
+        int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1;
+        int charsRemaining = end - input;
+        int newSize = result.size() + 2 * charsRemaining - 1 + 3*utf8len;
+        result.resize(newSize);
+
+        // set the output variable
+        output = reinterpret_cast<ushort *>(result.data()) + charsProcessed;
+    } else {
+        // verify that there's enough space or expand
+        int charsRemaining = end - input;
+        int pos = output - reinterpret_cast<const ushort *>(result.constData());
+        int spaceRemaining = result.size() - pos;
+        if (spaceRemaining < 3*charsRemaining + 3*utf8len) {
+            // must resize
+            result.resize(result.size() + 3*utf8len);
+            output = reinterpret_cast<ushort *>(result.data()) + pos;
+        }
+    }
+
+    if (QChar::requiresSurrogates(uc))
+        ++input;
+
+    // write the sequence
+    if (uc < 0x800) {
+        // first of two bytes
+        uchar c = 0xc0 | uchar(uc >> 6);
+        *output++ = '%';
+        *output++ = encodeNibble(c >> 4);
+        *output++ = encodeNibble(c & 0xf);
+    } else {
+        uchar c;
+        if (uc > 0xFFFF) {
+            // first two of four bytes
+            c = 0xf0 | uchar(uc >> 18);
+            *output++ = '%';
+            *output++ = 'F';
+            *output++ = encodeNibble(c & 0xf);
+
+            // continuation byte
+            c = 0x80 | (uchar(uc >> 12) & 0x3f);
+            *output++ = '%';
+            *output++ = encodeNibble(c >> 4);
+            *output++ = encodeNibble(c & 0xf);
+        } else {
+            // first of three bytes
+            c = 0xe0 | uchar(uc >> 12);
+            *output++ = '%';
+            *output++ = 'E';
+            *output++ = encodeNibble(c & 0xf);
+        }
+
+        // continuation byte
+        c = 0x80 | (uchar(uc >> 6) & 0x3f);
+        *output++ = '%';
+        *output++ = encodeNibble(c >> 4);
+        *output++ = encodeNibble(c & 0xf);
+    }
+
+    // continuation byte
+    uchar c = 0x80 | (uc & 0x3f);
+    *output++ = '%';
+    *output++ = encodeNibble(c >> 4);
+    *output++ = encodeNibble(c & 0xf);
+}
+
+Q_AUTOTEST_EXPORT QString
+qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
+             const uchar *tableModifications)
+{
+    uchar actionTable[sizeof defaultActionTable];
+    memcpy(actionTable, defaultActionTable, sizeof actionTable);
+    if (encoding & QUrl::DecodeSpaces)
+        actionTable[0] = DecodeCharacter; // decode
+
+    if (tableModifications) {
+        for (const ushort *p = tableModifications; *p; ++p)
+            actionTable[uchar(*p) - ' '] = *p >> 8;
+    }
+
+    QString result = component;
+    const ushort *input = reinterpret_cast<const ushort *>(component.constData());
+    const ushort * const end = input + component.length();
+    ushort *output = 0;
+
+    while (input != end) {
+        register ushort c = *input++;
+        register ushort decoded;
+        if (c == '%') {
+            // our input is always valid, so there are two hex characters for us to read here
+            decoded = (decodeNibble(input[0]) << 4) | decodeNibble(input[1]);
+        } else {
+            decoded = c;
+        }
+
+        EncodingAction action;
+        if (decoded < 0x20) {
+            // always encode control characters
+            action = EncodeCharacter;
+        } else if (decoded < 0x80) {
+            // use the table
+            action = EncodingAction(actionTable[decoded - ' ']);
+        } else {
+            // non-ASCII
+            bool decodeUnicode = encoding & QUrl::DecodeUnicode;
+
+            // should we leave it like this?
+            if ((c != '%' && decodeUnicode) || (c == '%' && !decodeUnicode)) {
+                action = LeaveCharacter;
+            } else if (decodeUnicode) {
+                // c == '%': decode the UTF-8 sequence
+                if (encodedUtf8ToUcs4(result, output, input, end, decoded))
+                    continue;
+                action = LeaveCharacter;
+            } else {
+                // c != '%': encode the UTF-8 sequence
+                unicodeToEncodedUtf8(result, output, input, end, decoded);
+                continue;
+            }
+        }
+
+        // there are six possibilities:
+        //  current \ action  | DecodeCharacter | LeaveCharacter | EncodeCharacter
+        //      decoded       |    1:leave      |    2:leave     |    3:encode
+        //      encoded       |    4:decode     |    5:leave     |    6:leave
+
+        if (c != '%' && (action == LeaveCharacter || action == DecodeCharacter)) {
+            // cases 1 and 2: it's decoded and we're leaving it as is
+            // there's always enough memory allocated for a single character
+            if (output)
+                *output++ = c;
+        } else if (c == '%' && (action == LeaveCharacter || action == EncodeCharacter)) {
+            // cases 5 and 6: it's encoded and we're leaving it as it is
+            // except we're pedantic and we'll uppercase the hex
+            if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) {
+                ensureDetached(result, output, input, end);
+                *output++ = '%';
+                *output++ = toUpperHex(*input++);
+                *output++ = toUpperHex(*input++);
+            }
+        } else if (c == '%' && action == DecodeCharacter) {
+            // case 4: we need to decode
+            ensureDetached(result, output, input, end);
+            *output++ = decoded;
+            input += 2;
+        } else {
+            // must be case 3: we need to encode
+            ensureDetached(result, output, input, end);
+            *output++ = '%';
+            *output++ = encodeNibble(c >> 4);
+            *output++ = encodeNibble(c & 0xf);
+        }
+    }
+
+    if (output)
+        result.truncate(output - reinterpret_cast<const ushort *>(result.constData()));
+    return result;
+}
+
+Q_AUTOTEST_EXPORT QString
+qt_tolerantParsePercentEncoding(const QString &url)
+{
+    // are there any '%'
+    int firstPercent = url.indexOf(QLatin1Char('%'));
+    if (firstPercent == -1) {
+        // none found, the string is fine
+        return url;
+    }
+
+    // are there any invalid percents?
+    int nextPercent = firstPercent;
+    int percentCount = 0;
+
+    {
+        int len = url.length();
+        bool ok = true;
+        do {
+            ++percentCount;
+            if (nextPercent + 2 >= len ||
+                    !isHex(url.at(nextPercent + 1).unicode()) ||
+                    !isHex(url.at(nextPercent + 2).unicode())) {
+                ok = false;
+            }
+
+            nextPercent = url.indexOf(QLatin1Char('%'), nextPercent + 1);
+        } while (nextPercent != -1);
+
+        if (ok)
+            return url;
+    }
+
+    // we've found at least one invalid percent
+    // that means all of them are invalid
+    QString corrected(url.size() + percentCount * 2, Qt::Uninitialized);
+    ushort *output = reinterpret_cast<ushort *>(corrected.data());
+    const ushort *input = reinterpret_cast<const ushort *>(url.constData());
+    for (int i = 0; i <= firstPercent; ++i)
+        output[i] = input[i];
+
+    const ushort *const end = input + url.length();
+    output += firstPercent + 1;
+    input += firstPercent + 1;
+
+    // we've copied up to the first percent
+    // correct this one and all others
+    *output++ = '2';
+    *output++ = '5';
+    while (input != end) {
+        // copy verbatim until the next percent, inclusive
+        *output++ = *input;
+        if (*input == '%') {
+            *output++ = '2';
+            *output++ = '5';
+        }
+        ++input;
+    }
+    return corrected;
+}
+
+QT_END_NAMESPACE
diff --git a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
index 10c4736f68..c71acef148 100644
--- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
+++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp
@@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
+** Copyright (C) 2012 Intel Corporation.
 ** Contact: http://www.qt-project.org/
 **
 ** This file is part of the test suite of the Qt Toolkit.
@@ -49,6 +50,9 @@ Q_CORE_EXPORT extern void qt_nameprep(QString *source, int from);
 Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int);
 Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output);
 Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc);
+Q_CORE_EXPORT QString qt_tolerantParsePercentEncoding(const QString &url);
+Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding,
+                                   const ushort *tableModifications = 0);
 QT_END_NAMESPACE
 
 // For testsuites
@@ -72,6 +76,7 @@ struct ushortarray {
 
 Q_DECLARE_METATYPE(ushortarray)
 Q_DECLARE_METATYPE(QUrl::FormattingOptions)
+Q_DECLARE_METATYPE(QUrl::ComponentFormattingOptions)
 
 class tst_QUrlInternal : public QObject
 {
@@ -91,6 +96,14 @@ private Q_SLOTS:
     void std3violations();
     void std3deviations_data();
     void std3deviations();
+
+    // percent-encoding internals
+    void correctEncodedMistakes_data();
+    void correctEncodedMistakes();
+    void encodingRecode_data();
+    void encodingRecode();
+    void encodingRecodeInvalidUtf8_data();
+    void encodingRecodeInvalidUtf8();
 };
 
 void tst_QUrlInternal::idna_testsuite_data()
@@ -745,6 +758,208 @@ void tst_QUrlInternal::std3deviations()
     QVERIFY(!url.host().isEmpty());
 }
 
+void tst_QUrlInternal::correctEncodedMistakes_data()
+{
+    QTest::addColumn<QString>("input");
+    QTest::addColumn<QString>("expected");
+
+    QTest::newRow("null") << QString() << QString();
+    QTest::newRow("empty") << "" << "";
+
+    // these contain one invalid percent
+    QTest::newRow("%") << QString("%") << QString("%25");
+    QTest::newRow("3%") << QString("3%") << QString("3%25");
+    QTest::newRow("13%") << QString("13%") << QString("13%25");
+    QTest::newRow("13%!") << QString("13%!") << QString("13%25!");
+    QTest::newRow("13%!!") << QString("13%!!") << QString("13%25!!");
+    QTest::newRow("13%a") << QString("13%a") << QString("13%25a");
+    QTest::newRow("13%az") << QString("13%az") << QString("13%25az");
+
+    // two invalid percents
+    QTest::newRow("13%%") << "13%%" << "13%25%25";
+    QTest::newRow("13%a%a") << "13%a%a" << "13%25a%25a";
+    QTest::newRow("13%az%az") << "13%az%az" << "13%25az%25az";
+
+    // these are correct (idempotent)
+    QTest::newRow("13%25") << QString("13%25")  << QString("13%25");
+    QTest::newRow("13%25%25") << QString("13%25%25")  << QString("13%25%25");
+
+    // these contain one invalid and one valid
+    // the code assumes they are all invalid
+    QTest::newRow("13%13..%") << "13%13..%" << "13%2513..%25";
+    QTest::newRow("13%..%13") << "13%..%13" << "13%25..%2513";
+
+    // three percents, one invalid
+    QTest::newRow("%01%02%3") << "%01%02%3" << "%2501%2502%253";
+}
+
+void tst_QUrlInternal::correctEncodedMistakes()
+{
+    QFETCH(QString, input);
+    QFETCH(QString, expected);
+
+    QString output = qt_tolerantParsePercentEncoding(input);
+    QCOMPARE(output, expected);
+    QCOMPARE(output.isNull(), expected.isNull());
+}
+
+static void addUtf8Data(const char *name, const char *data)
+{
+    QString encoded = QByteArray(data).toPercentEncoding();
+    QString decoded = QString::fromUtf8(data);
+
+    QTest::newRow(QByteArray("decode-") + name) << encoded << QUrl::ComponentFormattingOptions(QUrl::DecodeUnicode) << decoded;
+    QTest::newRow(QByteArray("encode-") + name) << decoded << QUrl::ComponentFormattingOptions(QUrl::FullyEncoded) << encoded;
+}
+
+void tst_QUrlInternal::encodingRecode_data()
+{
+    typedef QUrl::ComponentFormattingOptions F;
+    QTest::addColumn<QString>("input");
+    QTest::addColumn<F>("encodingMode");
+    QTest::addColumn<QString>("expected");
+
+    // -- idempotent tests --
+    for (int i = 0; i < 0x10; ++i) {
+        QByteArray code = QByteArray::number(i, 16);
+        F mode = QUrl::ComponentFormattingOption(i << 12);
+
+        QTest::newRow("null-0x" + code) << QString() << mode << QString();
+        QTest::newRow("empty-0x" + code) << "" << mode << "";
+
+        //    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+        // Unreserved characters are never encoded
+        QTest::newRow("alpha-0x" + code) << "abcABCZZzz" << mode << "abcABCZZzz";
+        QTest::newRow("digits-0x" + code) << "01234567890" << mode << "01234567890";
+        QTest::newRow("otherunreserved-0x" + code) << "-._~" << mode << "-._~";
+
+        // Control characters are always encoded
+        // Use uppercase because the output is also uppercased
+        QTest::newRow("control-nul-0x" + code) << "%00" << mode << "%00";
+        QTest::newRow("control-0x" + code) << "%0D%0A%1F%1A%7F" << mode << "%0D%0A%1F%1A%7F";
+
+        // The percent is always encoded
+        QTest::newRow("percent-0x" + code) << "25%2525" << mode << "25%2525";
+
+        // mixed control and unreserved
+        QTest::newRow("control-unreserved-0x" + code) << "Foo%00Bar%0D%0Abksp%7F" << mode << "Foo%00Bar%0D%0Abksp%7F";
+    }
+
+    //    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    //    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+    //                  / "*" / "+" / "," / ";" / "="
+    // in the default operation, delimiters don't get encoded or decoded
+    static const char delimiters[] =  ":/?#[]@" "!$&'()*+,;=";
+    for (const char *c = delimiters; *c; ++c) {
+        QByteArray code = QByteArray::number(*c, 16);
+        QString encoded = QString("abc%") + code.toUpper() + "def" ;
+        QString decoded = QString("abc") + *c + "def" ;
+        QTest::newRow("delimiter-encoded-" + code) << encoded << F(QUrl::FullyEncoded) << encoded;
+        QTest::newRow("delimiter-decoded-" + code) << decoded << F(QUrl::FullyEncoded) << decoded;
+    }
+
+    // encode control characters
+    QTest::newRow("encode-control") << "\1abc\2\033esc" << F(QUrl::MostDecoded) << "%01abc%02%1Besc";
+    QTest::newRow("encode-nul") << QString::fromLatin1("abc\0def", 7) << F(QUrl::MostDecoded) << "abc%00def";
+
+    // space
+    QTest::newRow("space-leave-decoded") << "Hello World " << F(QUrl::DecodeSpaces) << "Hello World ";
+    QTest::newRow("space-leave-encoded") << "Hello%20World%20" << F(QUrl::FullyEncoded) << "Hello%20World%20";
+    QTest::newRow("space-encode") << "Hello World " << F(QUrl::FullyEncoded) << "Hello%20World%20";
+    QTest::newRow("space-decode") << "Hello%20World%20" << F(QUrl::DecodeSpaces) << "Hello World ";
+
+    // decode unreserved
+    QTest::newRow("unreserved-decode") << "%66%6f%6f%42a%72" << F(QUrl::FullyEncoded) << "fooBar";
+
+    // mix encoding with decoding
+    QTest::newRow("encode-control-decode-space") << "\1\2%200" << F(QUrl::DecodeSpaces) << "%01%02 0";
+    QTest::newRow("decode-space-encode-control") << "%20\1\2" << F(QUrl::DecodeSpaces) << " %01%02";
+
+    // decode and encode valid UTF-8 data
+    // invalid is tested in encodingRecodeInvalidUtf8
+    addUtf8Data("utf8-2char-1", "\xC2\x80"); // U+0080
+    addUtf8Data("utf8-2char-2", "\xDF\xBF"); // U+07FF
+    addUtf8Data("utf8-3char-1", "\xE0\xA0\x80"); // U+0800
+    addUtf8Data("utf8-3char-2", "\xED\x9F\xBF"); // U+D7FF
+    addUtf8Data("utf8-3char-3", "\xEE\x80\x80"); // U+E000
+    addUtf8Data("utf8-3char-4", "\xEF\xBF\xBD"); // U+FFFD
+    addUtf8Data("utf8-2char-1", "\xF0\x90\x80\x80"); // U+10000
+    addUtf8Data("utf8-4char-2", "\xF4\x8F\xBF\xBD"); // U+10FFFD
+
+    // longer UTF-8 sequences, mixed with unreserved
+    addUtf8Data("utf8-string-1", "R\xc3\xa9sum\xc3\xa9");
+    addUtf8Data("utf8-string-2", "\xDF\xBF\xE0\xA0\x80""A");
+    addUtf8Data("utf8-string-3", "\xE0\xA0\x80\xDF\xBF...");
+
+    // special cases: stuff we can encode, but not decode
+    QTest::newRow("unicode-noncharacter") << QString(QChar(0xffff)) << F(QUrl::FullyEncoded) << "%EF%BF%BF";
+    QTest::newRow("unicode-lo-surrogate") << QString(QChar(0xD800)) << F(QUrl::FullyEncoded) << "%ED%A0%80";
+    QTest::newRow("unicode-hi-surrogate") << QString(QChar(0xDC00)) << F(QUrl::FullyEncoded) << "%ED%B0%80";
+
+    // a couple of Unicode strings with leading spaces
+    QTest::newRow("space-unicode") << QString::fromUtf8(" \xc2\xa0") << F(QUrl::FullyEncoded) << "%20%C2%A0";
+    QTest::newRow("space-space-unicode") << QString::fromUtf8("  \xc2\xa0") << F(QUrl::FullyEncoded) << "%20%20%C2%A0";
+    QTest::newRow("space-space-space-unicode") << QString::fromUtf8("   \xc2\xa0") << F(QUrl::FullyEncoded) << "%20%20%20%C2%A0";
+
+    // hex case testing
+    QTest::newRow("FF") << "%FF" << F(QUrl::FullyEncoded) << "%FF";
+    QTest::newRow("Ff") << "%Ff" << F(QUrl::FullyEncoded) << "%FF";
+    QTest::newRow("fF") << "%fF" << F(QUrl::FullyEncoded) << "%FF";
+    QTest::newRow("ff") << "%ff" << F(QUrl::FullyEncoded) << "%FF";
+
+    // decode UTF-8 mixed with non-UTF-8 and unreserved
+    QTest::newRow("utf8-mix-1") << "%80%C2%80" << F(QUrl::DecodeUnicode) << QString::fromUtf8("%80\xC2\x80");
+    QTest::newRow("utf8-mix-2") << "%C2%C2%80" << F(QUrl::DecodeUnicode) << QString::fromUtf8("%C2\xC2\x80");
+    QTest::newRow("utf8-mix-3") << "%E0%C2%80" << F(QUrl::DecodeUnicode) << QString::fromUtf8("%E0\xC2\x80");
+    QTest::newRow("utf8-mix-3") << "A%C2%80" << F(QUrl::DecodeUnicode) << QString::fromUtf8("A\xC2\x80");
+    QTest::newRow("utf8-mix-3") << "%C2%80A" << F(QUrl::DecodeUnicode) << QString::fromUtf8("\xC2\x80""A");
+}
+
+void tst_QUrlInternal::encodingRecode()
+{
+    QFETCH(QString, input);
+    QFETCH(QString, expected);
+    QFETCH(QUrl::ComponentFormattingOptions, encodingMode);
+
+    // ensure the string is properly percent-encoded
+    QVERIFY2(input == qt_tolerantParsePercentEncoding(input), "Test data is not properly encoded");
+    QVERIFY2(expected == qt_tolerantParsePercentEncoding(expected), "Test data is not properly encoded");
+
+    QString output = qt_urlRecode(input, encodingMode);
+    QCOMPARE(output, expected);
+    QCOMPARE(output.isNull(), expected.isNull());
+}
+
+void tst_QUrlInternal::encodingRecodeInvalidUtf8_data()
+{
+    QTest::addColumn<QByteArray>("utf8");
+    QTest::addColumn<QString>("utf16");
+
+    extern void loadInvalidUtf8Rows();
+    loadInvalidUtf8Rows();
+
+    extern void loadNonCharactersRows();
+    loadNonCharactersRows();
+
+    QTest::newRow("utf8-mix-4") << QByteArray("\xE0!A2\x80");
+    QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2!80");
+    QTest::newRow("utf8-mix-5") << QByteArray("\xE0\xA2\x33");
+}
+
+void tst_QUrlInternal::encodingRecodeInvalidUtf8()
+{
+    QFETCH(QByteArray, utf8);
+    QString input = utf8.toPercentEncoding();
+
+    QString output;
+    output = qt_urlRecode(input, QUrl::DecodeUnicode);
+    QCOMPARE(output, input);
+
+    // this is just control
+    output = qt_urlRecode(input, QUrl::FullyEncoded);
+    QCOMPARE(output, input);
+}
+
 QTEST_APPLESS_MAIN(tst_QUrlInternal)
 
 #include "tst_qurlinternal.moc"
author	Thiago Macieira <thiago.macieira@intel.com>	2011-09-05 23:17:21 +0200
committer	Qt by Nokia <qt-info@nokia.com>	2012-03-30 01:19:59 +0200
commit	6028efa3ff56b58ce70d5b8fdb53030185149028 (patch)
tree	d809a79bf11ed92778da5dc81fe45bbe2165ace6
parent	4c7e950aad0ed7b2bc114b3ffd5c73f7a433af52 (diff)