diff options
Diffstat (limited to 'src/corelib/io')
-rw-r--r-- | src/corelib/io/qurlrecode.cpp | 157 |
1 files changed, 80 insertions, 77 deletions
diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp index 27f541915d..3b08e1544d 100644 --- a/src/corelib/io/qurlrecode.cpp +++ b/src/corelib/io/qurlrecode.cpp @@ -138,8 +138,8 @@ static inline ushort decodeNibble(ushort c) // assumes that the range has been checked already static inline ushort decodePercentEncoding(const ushort *input) { - ushort c1 = input[0]; - ushort c2 = input[1]; + ushort c1 = input[1]; + ushort c2 = input[2]; if (!isHex(c1) || !isHex(c2)) return ushort(-1); return decodeNibble(c1) << 4 | decodeNibble(c2); @@ -151,18 +151,27 @@ static inline ushort encodeNibble(ushort c) return hexnumbers[c & 0xf]; } -static void ensureDetached(QString &result, ushort *&output, const ushort *input, const ushort *end) +static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end, + int add = 0) { if (!output) { // now detach // create enough space if the rest of the string needed to be percent-encoded - int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1; - int charsRemaining = end - input + 1; - int newSize = result.size() + 2 * charsRemaining; - result.resize(newSize); - - // set the output variable - output = reinterpret_cast<ushort *>(result.data()) + charsProcessed; + int charsProcessed = input - begin; + int charsRemaining = end - input; + int spaceNeeded = end - begin + 2 * charsRemaining + add; + int origSize = result.size(); + result.resize(origSize + spaceNeeded); + + // we know that resize() above detached, so we bypass the reference count check + output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())) + + origSize; + + // copy the chars we've already processed + int i; + for (i = 0; i < charsProcessed; ++i) + output[i] = begin[i]; + output += i; } } @@ -180,7 +189,8 @@ static inline bool isUnicodeNonCharacter(uint ucs4) } // returns true if we performed an UTF-8 decoding -static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded) +static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input, + const ushort *end, ushort decoded) { int charsNeeded; uint min_uc; @@ -191,15 +201,15 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences return false; } else if (decoded < 0xe0) { - charsNeeded = 1; + charsNeeded = 2; min_uc = 0x80; uc = decoded & 0x1f; } else if (decoded < 0xf0) { - charsNeeded = 2; + charsNeeded = 3; min_uc = 0x800; uc = decoded & 0x0f; } else if (decoded < 0xf5) { - charsNeeded = 3; + charsNeeded = 4; min_uc = 0x10000; uc = decoded & 0x07; } else { @@ -210,10 +220,10 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i } // are there enough remaining? - if (end - input < 3*charsNeeded + 2) + if (end - input < 3*charsNeeded) return false; - if (input[2] != '%') + if (input[3] != '%') return false; // first continuation character @@ -223,8 +233,8 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i uc <<= 6; uc |= decoded & 0x3f; - if (charsNeeded > 1) { - if (input[5] != '%') + if (charsNeeded > 2) { + if (input[6] != '%') return false; // second continuation character @@ -234,8 +244,8 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i uc <<= 6; uc |= decoded & 0x3f; - if (charsNeeded > 2) { - if (input[8] != '%') + if (charsNeeded > 3) { + if (input[9] != '%') return false; // third continuation character @@ -253,36 +263,28 @@ static uint encodedUtf8ToUcs4(QString &result, ushort *&output, const ushort *&i if (isUnicodeNonCharacter(uc) || (uc >= 0xD800 && uc <= 0xDFFF) || uc >= 0x110000) return false; - // detach if necessary - if (!output) { - // create enough space if the rest of the string needed to be percent-encoded - int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1; - int charsRemaining = end - input - 2 - 3*charsNeeded; - int newSize = result.size() + 2 * charsRemaining; - result.resize(newSize); - - // set the output variable - output = reinterpret_cast<ushort *>(result.data()) + charsProcessed; - } - if (!QChar::requiresSurrogates(uc)) { // UTF-8 decoded and no surrogates are required + // detach if necessary + ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 1); *output++ = uc; } else { // UTF-8 decoded to something that requires a surrogate pair + ensureDetached(result, output, begin, input, end, -9 * charsNeeded + 2); *output++ = QChar::highSurrogate(uc); *output++ = QChar::lowSurrogate(uc); } - input += charsNeeded * 3 + 2; + input += charsNeeded * 3 - 1; return true; } -static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *&input, const ushort *end, ushort decoded) +static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin, + const ushort *&input, const ushort *end, ushort decoded) { uint uc = decoded; if (QChar::isHighSurrogate(uc)) { - if (QChar::isLowSurrogate(*input)) - uc = QChar::surrogateToUcs4(uc, *input); + if (input < end && QChar::isLowSurrogate(input[1])) + uc = QChar::surrogateToUcs4(uc, input[1]); } // note: we will encode bad UTF-16 to UTF-8 @@ -293,29 +295,24 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort // detach if (!output) { - // create enough space if the rest of the string needed to be percent-encoded - int charsProcessed = input - reinterpret_cast<const ushort *>(result.constData()) - 1; - int charsRemaining = end - input; - int newSize = result.size() + 2 * charsRemaining - 1 + 3*utf8len; - result.resize(newSize); - - // set the output variable - output = reinterpret_cast<ushort *>(result.data()) + charsProcessed; + // we need 3 * utf8len for the encoded UTF-8 sequence + // but ensureDetached already adds 3 for the char we're processing + ensureDetached(result, output, begin, input, end, 3*utf8len - 3); } else { // verify that there's enough space or expand - int charsRemaining = end - input; + int charsRemaining = end - input - 1; // not including this one int pos = output - reinterpret_cast<const ushort *>(result.constData()); int spaceRemaining = result.size() - pos; if (spaceRemaining < 3*charsRemaining + 3*utf8len) { // must resize result.resize(result.size() + 3*utf8len); - output = reinterpret_cast<ushort *>(result.data()) + pos; + + // we know that resize() above detached, so we bypass the reference count check + output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData())); + output += pos; } } - if (QChar::requiresSurrogates(uc)) - ++input; - // write the sequence if (uc < 0x800) { // first of two bytes @@ -337,6 +334,9 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *output++ = '%'; *output++ = encodeNibble(c >> 4); *output++ = encodeNibble(c & 0xf); + + // this was a surrogate pair + ++input; } else { // first of three bytes c = 0xe0 | uchar(uc >> 12); @@ -359,22 +359,21 @@ static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *output++ = encodeNibble(c & 0xf); } -static QString recode(const QString &component, QUrl::ComponentFormattingOptions encoding, - const uchar *actionTable, bool retryBadEncoding) +static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding, + const uchar *actionTable, bool retryBadEncoding) { - QString result = component; - const ushort *input = reinterpret_cast<const ushort *>(component.constData()); - const ushort * const end = input + component.length(); + const int origSize = result.size(); + const ushort *input = begin; ushort *output = 0; - while (input != end) { + for ( ; input != end; ++input) { register ushort c; EncodingAction action; // try a run where no change is necessary - while (input != end) { - c = *input++; - if (c < 0x20 || c >= 0x80) // also: (c - 0x20 < 0x60U) + for ( ; input != end; ++input) { + c = *input; + if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U) goto non_trivial; action = EncodingAction(actionTable[c - ' ']); if (action == EncodeCharacter) @@ -388,23 +387,23 @@ non_trivial: register uint decoded; if (c == '%' && retryBadEncoding) { // always write "%25" - ensureDetached(result, output, input, end); + ensureDetached(result, output, begin, input, end); *output++ = '%'; *output++ = '2'; *output++ = '5'; continue; } else if (c == '%') { // check if the input is valid - if (input + 1 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { + if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) { // not valid, retry - result.clear(); - return recode(component, encoding, actionTable, true); + result.resize(origSize); + return recode(result, begin, end, encoding, actionTable, true); } if (decoded >= 0x80) { // decode the UTF-8 sequence if (encoding & QUrl::DecodeUnicode && - encodedUtf8ToUcs4(result, output, input, end, decoded)) + encodedUtf8ToUtf16(result, output, begin, input, end, decoded)) continue; // decoding the encoded UTF-8 failed @@ -416,7 +415,7 @@ non_trivial: decoded = c; if (decoded >= 0x80 && (encoding & QUrl::DecodeUnicode) == 0) { // encode the UTF-8 sequence - unicodeToEncodedUtf8(result, output, input, end, decoded); + unicodeToEncodedUtf8(result, output, begin, input, end, decoded); continue; } else if (decoded >= 0x80) { if (output) @@ -437,34 +436,37 @@ non_trivial: if (c == '%' && action != DecodeCharacter) { // cases 5 and 6: it's encoded and we're leaving it as it is // except we're pedantic and we'll uppercase the hex - if (output || !isUpperHex(input[0]) || !isUpperHex(input[1])) { - ensureDetached(result, output, input, end); + if (output || !isUpperHex(input[1]) || !isUpperHex(input[2])) { + ensureDetached(result, output, begin, input, end); *output++ = '%'; - *output++ = toUpperHex(*input++); - *output++ = toUpperHex(*input++); + *output++ = toUpperHex(*++input); + *output++ = toUpperHex(*++input); } } else if (c == '%' && action == DecodeCharacter) { // case 4: we need to decode - ensureDetached(result, output, input, end); + ensureDetached(result, output, begin, input, end); *output++ = decoded; input += 2; } else { // must be case 3: we need to encode - ensureDetached(result, output, input, end); + ensureDetached(result, output, begin, input, end); *output++ = '%'; *output++ = encodeNibble(c >> 4); *output++ = encodeNibble(c & 0xf); } } - if (output) - result.truncate(output - reinterpret_cast<const ushort *>(result.constData())); - return result; + if (output) { + int len = output - reinterpret_cast<const ushort *>(result.constData()); + result.truncate(len); + return len - origSize; + } + return 0; } -Q_AUTOTEST_EXPORT QString -qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding, - const ushort *tableModifications) +Q_AUTOTEST_EXPORT int +qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end, + QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications) { uchar actionTable[sizeof defaultActionTable]; if (encoding & QUrl::DecodeAllDelimiters) { @@ -487,7 +489,8 @@ qt_urlRecode(const QString &component, QUrl::ComponentFormattingOptions encoding actionTable[uchar(*p) - ' '] = *p >> 8; } - return recode(component, encoding, actionTable, false); + return recode(appendTo, reinterpret_cast<const ushort *>(begin), reinterpret_cast<const ushort *>(end), + encoding, actionTable, false); } QT_END_NAMESPACE |