diff options
-rw-r--r-- | src/corelib/text/qstringconverter.cpp | 172 | ||||
-rw-r--r-- | tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp | 8 | ||||
-rw-r--r-- | tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp | 8 |
3 files changed, 82 insertions, 106 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index f4a51ef93d..43ebde6449 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -416,6 +416,8 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta Q_ASSERT(state); const QChar *uc = in.data(); qsizetype len = in.length(); + if (!len) + return out; auto appendReplacementChar = [state](uchar *cursor) -> uchar * { if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) { @@ -433,56 +435,50 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta const ushort *src = reinterpret_cast<const ushort *>(uc); const ushort *const end = src + len; - int surrogate_high = -1; - if (state->remainingChars) { - surrogate_high = state->state_data[0]; - } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) { - // append UTF-8 BOM - *cursor++ = utf8bom[0]; - *cursor++ = utf8bom[1]; - *cursor++ = utf8bom[2]; - state->internalState |= HeaderDone; + if (!(state->flags & QStringDecoder::Flag::Stateless)) { + if (state->remainingChars) { + int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end); + if (res < 0) + cursor = appendReplacementChar(cursor); + state->state_data[0] = 0; + state->remainingChars = 0; + } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) { + // append UTF-8 BOM + *cursor++ = utf8bom[0]; + *cursor++ = utf8bom[1]; + *cursor++ = utf8bom[2]; + state->internalState |= HeaderDone; + } } - const ushort *nextAscii = src; while (src != end) { - int res; - ushort uc; - if (surrogate_high != -1) { - uc = surrogate_high; - surrogate_high = -1; - res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); - } else { - if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end)) - break; + const ushort *nextAscii = end; + if (simdEncodeAscii(cursor, nextAscii, src, end)) + break; - uc = *src++; - res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); - } - if (Q_LIKELY(res >= 0)) - continue; + do { + ushort uc = *src++; + int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); + if (Q_LIKELY(res >= 0)) + continue; - if (res == QUtf8BaseTraits::Error) { - // encoding error - ++state->invalidChars; - cursor = appendReplacementChar(cursor); - } else if (res == QUtf8BaseTraits::EndOfString) { - surrogate_high = uc; - break; - } + if (res == QUtf8BaseTraits::Error) { + // encoding error + ++state->invalidChars; + cursor = appendReplacementChar(cursor); + } else if (res == QUtf8BaseTraits::EndOfString) { + if (state->flags & QStringConverter::Flag::Stateless) { + ++state->invalidChars; + cursor = appendReplacementChar(cursor); + } else { + state->remainingChars = 1; + state->state_data[0] = uc; + } + return reinterpret_cast<char *>(cursor); + } + } while (src < nextAscii); } - state->internalState |= HeaderDone; - state->remainingChars = 0; - if (surrogate_high >= 0) { - if (state->flags & QStringConverter::Flag::Stateless) { - ++state->invalidChars; - cursor = appendReplacementChar(cursor); - } else { - state->remainingChars = 1; - state->state_data[0] = surrogate_high; - } - } return reinterpret_cast<char *>(cursor); } @@ -581,8 +577,9 @@ QString QUtf8::convertToUnicode(const char *chars, qsizetype len, QStringConvert QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QStringConverter::State *state) { Q_ASSERT(state); + if (!len) + return out; - bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom; ushort replacement = QChar::ReplacementCharacter; if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) @@ -595,62 +592,60 @@ QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QSt const uchar *src = reinterpret_cast<const uchar *>(chars); const uchar *end = src + len; - if (state->remainingChars) { - // handle incoming state first - uchar remainingCharsData[4]; // longest UTF-8 sequence possible - qsizetype remainingCharsCount = state->remainingChars; - qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src); - - memset(remainingCharsData, 0, sizeof(remainingCharsData)); - memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); - memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); - - const uchar *begin = &remainingCharsData[1]; - res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, - static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); - if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) { - // special case for len == 0: - // if we were supplied an empty string, terminate the previous, unfinished sequence with error - ++state->invalidChars; - *dst++ = replacement; - } else if (res == QUtf8BaseTraits::EndOfString) { - // if we got EndOfString again, then there were too few bytes in src; - // copy to our state and return - state->remainingChars = remainingCharsCount + newCharsToCopy; - memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); - return out; - } else if (!headerdone && res >= 0) { - // eat the UTF-8 BOM - headerdone = true; - if (dst[-1] == 0xfeff) - --dst; - } + if (!(state->flags & QStringConverter::Flag::Stateless)) { + bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom; + if (state->remainingChars || !headerdone) { + // handle incoming state first + uchar remainingCharsData[4]; // longest UTF-8 sequence possible + qsizetype remainingCharsCount = state->remainingChars; + qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src); + + memset(remainingCharsData, 0, sizeof(remainingCharsData)); + memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); + memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); + + const uchar *begin = &remainingCharsData[1]; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, + static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); + if (res == QUtf8BaseTraits::Error) { + ++state->invalidChars; + *dst++ = replacement; + ++src; + } else if (res == QUtf8BaseTraits::EndOfString) { + // if we got EndOfString again, then there were too few bytes in src; + // copy to our state and return + state->remainingChars = remainingCharsCount + newCharsToCopy; + memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); + return out; + } else if (!headerdone) { + // eat the UTF-8 BOM + if (dst[-1] == 0xfeff) + --dst; + } + state->internalState |= HeaderDone; - // adjust src now that we have maybe consumed a few chars - if (res >= 0) { - Q_ASSERT(res > remainingCharsCount); - src += res - remainingCharsCount; + // adjust src now that we have maybe consumed a few chars + if (res >= 0) { + Q_ASSERT(res > remainingCharsCount); + src += res - remainingCharsCount; + } } + } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) { + // stateless, remove initial BOM + if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2]) + // skip BOM + src += 3; } // main body, stateless decoding res = 0; const uchar *nextAscii = src; - const uchar *start = src; while (res >= 0 && src < end) { if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) break; ch = *src++; res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end); - if (!headerdone && res >= 0) { - headerdone = true; - if (src == start + 3) { // 3 == sizeof(utf8-bom) - // eat the UTF-8 BOM (it can only appear at the beginning of the string). - if (dst[-1] == 0xfeff) - --dst; - } - } if (res == QUtf8BaseTraits::Error) { res = 0; ++state->invalidChars; @@ -677,9 +672,6 @@ QChar *QUtf8::convertToUnicode(QChar *out, const char *chars, qsizetype len, QSt state->remainingChars = 0; } - if (headerdone) - state->internalState |= HeaderDone; - return reinterpret_cast<QChar *>(dst); } diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp index 4dc75b11c0..08c2954484 100644 --- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp @@ -1641,14 +1641,6 @@ void tst_QTextCodec::utf8stateful_data() QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString(); QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString(); - // invalid: sequence too short (the empty second buffer causes a state reset) - QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString(); - QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString(); - QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString(); - QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString(); - QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString(); - QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString(); - // overlong sequence: QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString(); QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString(); diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index e95bce6915..58ae28bbf4 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -1335,14 +1335,6 @@ void tst_QStringConverter::utf8stateful_data() QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString(); QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString(); - // invalid: sequence too short (the empty second buffer causes a state reset) - QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString(); - QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString(); - QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString(); - QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString(); - QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString(); - QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString(); - // overlong sequence: QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString(); QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString(); |