diff options
-rw-r--r-- | src/corelib/io/qurlrecode.cpp | 201 | ||||
-rw-r--r-- | tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp | 18 |
2 files changed, 107 insertions, 112 deletions
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp index 6a0517a7e5..27f541915d 100644 --- a/src/corelib/io/qurlrecode.cpp +++ b/src/corelib/io/qurlrecode.cpp @@ -133,6 +133,18 @@ static inline ushort decodeNibble(ushort c) c >= 'A' ? c - 'A' + 0xA : c - '0'; } +// if the sequence at input is 2*HEXDIG, returns its decoding +// returns -1 if it isn't. +// assumes that the range has been checked already +static inline ushort decodePercentEncoding(const ushort *input) +{ + ushort c1 = input[0]; + ushort c2 = input[1]; + if (!isHex(c1) || !isHex(c2)) + return ushort(-1); + return decodeNibble(c1) << 4 | decodeNibble(c2); +} + static inline ushort encodeNibble(ushort c) { static const uchar hexnumbers[] = "0123456789ABCDEF"; @@ -170,16 +182,15 @@ static inline bool isUnicodeNonCharacter(uint ucs4) // returns true if we performed an UTF-8 decoding static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded) { + int charsNeeded; + uint min_uc; + uint uc; + if (decoded <= 0xC1) { // an UTF-8 first character must be at least 0xC0 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences return false; - } - - int charsNeeded; - uint min_uc; - uint uc; - if (decoded < 0xe0) { + } else if (decoded < 0xe0) { charsNeeded = 1; min_uc = 0x80; uc = decoded & 0x1f; @@ -194,7 +205,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i } else { // the last Unicode character is U+10FFFF // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" - // therefore, a byte outside the range 0xC0..0xF4 is not the UTF-8 first byte + // therefore, a byte higher than 0xF4 is not the UTF-8 first byte return false; } @@ -206,7 +217,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i return false; // first continuation character - decoded = (decodeNibble(input[3]) << 4) | decodeNibble(input[4]); + decoded = decodePercentEncoding(input + 3); if ((decoded & 0xc0) != 0x80) return false; uc <<= 6; @@ -217,7 +228,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i return false; // second continuation character - decoded = (decodeNibble(input[6]) << 4) | decodeNibble(input[7]); + decoded = decodePercentEncoding(input + 6); if ((decoded & 0xc0) != 0x80) return false; uc <<= 6; @@ -228,7 +239,7 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i return false; // third continuation character - decoded = (decodeNibble(input[9]) << 4) | decodeNibble(input[10]); + decoded = decodePercentEncoding(input + 9); if ((decoded & 0xc0) != 0x80) return false; uc <<= 6; @@ -348,72 +359,82 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *output++ = encodeNibble(c & 0xf); } -Q_AUTOTEST_EXPORT QString -qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding, - const uchar *tableModifications) +static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding, + const uchar *actionTable, bool retryBadEncoding) { - uchar actionTable[sizeof defaultActionTable]; - memcpy(actionTable, defaultActionTable, sizeof actionTable); - if (encoding & QUrl::DecodeSpaces) - actionTable[0] = DecodeCharacter; // decode - - if (tableModifications) { - for (const ushort *p = tableModifications; *p; ++p) - actionTable[uchar(*p) - ' '] = *p >> 8; - } - QString result = component; const ushort *input = reinterpret_cast<const ushort *>(component.constData()); const ushort * const end = input + component.length(); ushort *output = 0; while (input != end) { - register ushort c = *input++; - register ushort decoded; - if (c == '%') { - // our input is always valid, so there are two hex characters for us to read here - decoded = (decodeNibble(input[0]) << 4) | decodeNibble(input[1]); - } else { - decoded = c; + register ushort c; + EncodingAction action; + + // try a run where no change is necessary + while (input != end) { + c = *input++; + if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U) + goto non_trivial; + action = EncodingAction(actionTable[c - ' ']); + if (action == EncodeCharacter) + goto non_trivial; + if (output) + *output++ = c; } + break; - EncodingAction action; - if (decoded < 0x20) { - // always encode control characters - action = EncodeCharacter; - } else if (decoded < 0x80) { - // use the table - action = EncodingAction(actionTable[decoded - ' ']); - } else { - // non-ASCII - bool decodeUnicode = encoding & QUrl::DecodeUnicode; +non_trivial: + register uint decoded; + if (c == '%' && retryBadEncoding) { + // always write "%25" + ensureDetached(result, output, input, end); + *output++ = '%'; + *output++ = '2'; + *output++ = '5'; + continue; + } else if (c == '%') { + // check if the input is valid + if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { + // not valid, retry + result.clear(); + return recode(component, encoding, actionTable, true); + } - // should we leave it like this? - if ((c != '%' && decodeUnicode) || (c == '%' && !decodeUnicode)) { - action = LeaveCharacter; - } else if (decodeUnicode) { - // c == '%': decode the UTF-8 sequence - if (encodedUtf8ToUcs4(result, output, input, end, decoded)) + if (decoded >= 0x80) { + // decode the UTF-8 sequence + if (encoding & QUrl::DecodeUnicode && + encodedUtf8ToUcs4(result, output, input, end, decoded)) continue; + + // decoding the encoded UTF-8 failed action = LeaveCharacter; - } else { - // c != '%': encode the UTF-8 sequence + } else if (decoded >= 0x20) { + action = EncodingAction(actionTable[decoded - ' ']); + } + } else { + decoded = c; + if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) { + // encode the UTF-8 sequence unicodeToEncodedUtf8(result, output, input, end, decoded); continue; + } else if (decoded >= 0x80) { + if (output) + *output++ = c; + continue; } } + if (decoded < 0x20) + action = EncodeCharacter; + // there are six possibilities: // current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter // decoded | 1:leave | 2:leave | 3:encode // encoded | 4:decode | 5:leave | 6:leave + // cases 1 and 2 were handled before this section - if (c != '%' && (action == LeaveCharacter || action == DecodeCharacter)) { - // cases 1 and 2: it's decoded and we're leaving it as is - // there's always enough memory allocated for a single character - if (output) - *output++ = c; - } else if (c == '%' && (action == LeaveCharacter || action == EncodeCharacter)) { + if (c == '%' && action != DecodeCharacter) { // cases 5 and 6: it's encoded and we're leaving it as it is // except we're pedantic and we'll uppercase the hex if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) { @@ -442,63 +463,31 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding } Q_AUTOTEST_EXPORT QString -qt_tolerantParsePercentEncoding(const QString &url) +qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding, + const ushort *tableModifications) { - // are there any '%' - int firstPercent = url.indexOf(QLatin1Char('%')); - if (firstPercent == -1) { - // none found, the string is fine - return url; + uchar actionTable[sizeof defaultActionTable]; + if (encoding & QUrl::DecodeAllDelimiters) { + // reset the table + memset(actionTable, DecodeCharacter, sizeof actionTable); + if (!(encoding & QUrl::DecodeSpaces)) + actionTable[0] = EncodeCharacter; + + // these are always encoded + actionTable['%' - ' '] = EncodeCharacter; + actionTable[0x7F - ' '] = EncodeCharacter; + } else { + memcpy(actionTable, defaultActionTable, sizeof actionTable); + if (encoding & QUrl::DecodeSpaces) + actionTable[0] = DecodeCharacter; // decode } - // are there any invalid percents? - int nextPercent = firstPercent; - int percentCount = 0; - - { - int len = url.length(); - bool ok = true; - do { - ++percentCount; - if (nextPercent + 2 >= len || - !isHex(url.at(nextPercent + 1).unicode()) || - !isHex(url.at(nextPercent + 2).unicode())) { - ok = false; - } - - nextPercent = url.indexOf(QLatin1Char('%'), nextPercent + 1); - } while (nextPercent != -1); - - if (ok) - return url; + if (tableModifications) { + for (const ushort *p = tableModifications; *p; ++p) + actionTable[uchar(*p) - ' '] = *p >> 8; } - // we've found at least one invalid percent - // that means all of them are invalid - QString corrected(url.size() + percentCount * 2, Qt::Uninitialized); - ushort *output = reinterpret_cast<ushort *>(corrected.data()); - const ushort *input = reinterpret_cast<const ushort *>(url.constData()); - for (int i = 0; i <= firstPercent; ++i) - output[i] = input[i]; - - const ushort *const end = input + url.length(); - output += firstPercent + 1; - input += firstPercent + 1; - - // we've copied up to the first percent - // correct this one and all others - *output++ = '2'; - *output++ = '5'; - while (input != end) { - // copy verbatim until the next percent, inclusive - *output++ = *input; - if (*input == '%') { - *output++ = '2'; - *output++ = '5'; - } - ++input; - } - return corrected; + return recode(component, encoding, actionTable, false); } QT_END_NAMESPACE diff --git a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp index c71acef148..3761603c28 100644 --- a/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp +++ b/tests/auto/corelib/io/qurlinternal/tst_qurlinternal.cpp @@ -50,7 +50,6 @@ Q_CORE_EXPORT extern void qt_nameprep(QString *source, int from); Q_CORE_EXPORT extern bool qt_check_std3rules(const QChar *, int); Q_CORE_EXPORT void qt_punycodeEncoder(const QChar *s, int ucLength, QString *output); Q_CORE_EXPORT QString qt_punycodeDecoder(const QString &pc); -Q_CORE_EXPORT QString qt_tolerantParsePercentEncoding(const QString &url); Q_CORE_EXPORT QString qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications = 0); QT_END_NAMESPACE @@ -791,6 +790,17 @@ void tst_QUrlInternal::correctEncodedMistakes_data() // three percents, one invalid QTest::newRow("%01%02%3") << "%01%02%3" << "%2501%2502%253"; + + // now mix bad percents with Unicode decoding + QTest::newRow("%C2%") << "%C2%" << "%25C2%25"; + QTest::newRow("%C2%A") << "%C2%A" << "%25C2%25A"; + QTest::newRow("%C2%Az") << "%C2%Az" << "%25C2%25Az"; + QTest::newRow("%E2%A0%") << "%E2%A0%" << "%25E2%25A0%25"; + QTest::newRow("%E2%A0%A") << "%E2%A0%A" << "%25E2%25A0%25A"; + QTest::newRow("%E2%A0%Az") << "%E2%A0%Az" << "%25E2%25A0%25Az"; + QTest::newRow("%F2%A0%A0%") << "%F2%A0%A0%" << "%25F2%25A0%25A0%25"; + QTest::newRow("%F2%A0%A0%A") << "%F2%A0%A0%A" << "%25F2%25A0%25A0%25A"; + QTest::newRow("%F2%A0%A0%Az") << "%F2%A0%A0%Az" << "%25F2%25A0%25A0%25Az"; } void tst_QUrlInternal::correctEncodedMistakes() @@ -798,7 +808,7 @@ void tst_QUrlInternal::correctEncodedMistakes() QFETCH(QString, input); QFETCH(QString, expected); - QString output = qt_tolerantParsePercentEncoding(input); + QString output = qt_urlRecode(input, QUrl::DecodeUnicode); QCOMPARE(output, expected); QCOMPARE(output.isNull(), expected.isNull()); } @@ -921,10 +931,6 @@ void tst_QUrlInternal::encodingRecode() QFETCH(QString, expected); QFETCH(QUrl::ComponentFormattingOptions, encodingMode); - // ensure the string is properly percent-encoded - QVERIFY2(input == qt_tolerantParsePercentEncoding(input), "Test data is not properly encoded"); - QVERIFY2(expected == qt_tolerantParsePercentEncoding(expected), "Test data is not properly encoded"); - QString output = qt_urlRecode(input, encodingMode); QCOMPARE(output, expected); QCOMPARE(output.isNull(), expected.isNull()); |