diff options
-rw-r--r-- | src/corelib/codecs/qutfcodec.cpp | 44 | ||||
-rw-r--r-- | tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp | 96 |
2 files changed, 132 insertions, 8 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 072cda63aa..c5f580e13d 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -237,7 +237,20 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve QString QUtf8::convertToUnicode(const char *chars, int len) { - QString result(len + 1, Qt::Uninitialized); // worst case + // UTF-8 to UTF-16 always needs the exact same number of words or less: + // UTF-8 UTF-16 + // 1 byte 1 word + // 2 bytes 1 word + // 3 bytes 1 word + // 4 bytes 2 words (one surrogate pair) + // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8), + // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or + // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g., CJK). + // + // The table holds for invalid sequences too: we'll insert one replacement char + // per invalid byte. + QString result(len, Qt::Uninitialized); + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); const uchar *src = reinterpret_cast<const uchar *>(chars); const uchar *end = src + len; @@ -282,7 +295,18 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte int res; uchar ch = 0; - QString result(need + len + 1, Qt::Uninitialized); // worst case + // See above for buffer requirements for stateless decoding. However, that + // fails if the state is not empty. 
The following situations can add to the + // requirements: + // state contains chars starts with requirement + // 1 of 2 bytes valid continuation 0 + // 2 of 3 bytes same 0 + // 3 of 4 bytes same +1 (need to insert surrogate pair) + // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart) + // 2 of 3 bytes same +1 (same) + // 3 of 4 bytes same +1 (same) + QString result(need + len + 1, Qt::Uninitialized); + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); const uchar *src = reinterpret_cast<const uchar *>(chars); const uchar *end = src + len; @@ -305,15 +329,17 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte const uchar *begin = &remainingCharsData[1]; res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); - if (res == QUtf8BaseTraits::EndOfString) { + if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) { + // special case for len == 0: + // if we were supplied an empty string, terminate the previous, unfinished sequence with error + ++invalid; + *dst++ = replacement; + } else if (res == QUtf8BaseTraits::EndOfString) { // if we got EndOfString again, then there were too few bytes in src; // copy to our state and return state->remainingChars = remainingCharsCount + newCharsToCopy; memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); return QString(); - } else if (res == QUtf8BaseTraits::Error) { - ++invalid; - *dst++ = replacement; } else if (!headerdone && res >= 0) { // eat the UTF-8 BOM headerdone = true; @@ -322,8 +348,10 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte } // adjust src now that we have maybe consumed a few chars - //Q_ASSERT(res > remainingCharsCount) - src += res - remainingCharsCount; + if (res >= 0) { + Q_ASSERT(res > remainingCharsCount); + src 
+= res - remainingCharsCount; + } } } diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp index 12b81ee7d4..54e8f8c386 100644 --- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp @@ -80,6 +80,9 @@ private slots: void utf8bom_data(); void utf8bom(); + void utf8stateful_data(); + void utf8stateful(); + void utfHeaders_data(); void utfHeaders(); @@ -1611,6 +1614,99 @@ void tst_QTextCodec::utf8bom() QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result); } +void tst_QTextCodec::utf8stateful_data() +{ + QTest::addColumn<QByteArray>("buffer1"); + QTest::addColumn<QByteArray>("buffer2"); + QTest::addColumn<QString>("result"); // null QString indicates decoder error + + // valid buffer continuations + QTest::newRow("1of2+valid") << QByteArray("\xc2") << QByteArray("\xa0") << "\xc2\xa0"; + QTest::newRow("1of3+valid") << QByteArray("\xe0") << QByteArray("\xa0\x80") << "\xe0\xa0\x80"; + QTest::newRow("2of3+valid") << QByteArray("\xe0\xa0") << QByteArray("\x80") << "\xe0\xa0\x80"; + QTest::newRow("1of4+valid") << QByteArray("\360") << QByteArray("\220\210\203") << "\360\220\210\203"; + QTest::newRow("2of4+valid") << QByteArray("\360\220") << QByteArray("\210\203") << "\360\220\210\203"; + QTest::newRow("3of4+valid") << QByteArray("\360\220\210") << QByteArray("\203") << "\360\220\210\203"; + QTest::newRow("1ofBom+valid") << QByteArray("\xef") << QByteArray("\xbb\xbf") << ""; + QTest::newRow("2ofBom+valid") << QByteArray("\xef\xbb") << QByteArray("\xbf") << ""; + + // invalid continuation + QTest::newRow("1of2+invalid") << QByteArray("\xc2") << QByteArray("a") << QString(); + QTest::newRow("1of3+invalid") << QByteArray("\xe0") << QByteArray("a") << QString(); + QTest::newRow("2of3+invalid") << QByteArray("\xe0\xa0") << QByteArray("a") << QString(); + QTest::newRow("1of4+invalid") << QByteArray("\360") << 
QByteArray("a") << QString(); + QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString(); + QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString(); + + // invalid: sequence too short (the empty second buffer causes a state reset) + QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString(); + QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString(); + QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString(); + QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString(); + QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString(); + QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString(); + + // overlong sequence: + QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString(); + QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString(); + QTest::newRow("overlong-2of3") << QByteArray("\xe0\x81") << QByteArray("\x81") << QString(); + QTest::newRow("overlong-1of4") << QByteArray("\xf0") << QByteArray("\x80\x81\x81") << QString(); + QTest::newRow("overlong-2of4") << QByteArray("\xf0\x80") << QByteArray("\x81\x81") << QString(); + QTest::newRow("overlong-3of4") << QByteArray("\xf0\x80\x81") << QByteArray("\x81") << QString(); + + // out of range: + // leading byte 0xF4 can produce codepoints above U+10FFFF, which aren't valid + QTest::newRow("outofrange1-1of4") << QByteArray("\xf4") << QByteArray("\x90\x80\x80") << QString(); + QTest::newRow("outofrange1-2of4") << QByteArray("\xf4\x90") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange1-3of4") << QByteArray("\xf4\x90\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange2-1of4") << QByteArray("\xf5") << QByteArray("\x90\x80\x80") << QString(); + QTest::newRow("outofrange2-2of4") << QByteArray("\xf5\x90") << 
QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange2-3of4") << QByteArray("\xf5\x90\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange-1of5") << QByteArray("\xf8") << QByteArray("\x88\x80\x80\x80") << QString(); + QTest::newRow("outofrange-2of5") << QByteArray("\xf8\x88") << QByteArray("\x80\x80\x80") << QString(); + QTest::newRow("outofrange-3of5") << QByteArray("\xf8\x88\x80") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange-4of5") << QByteArray("\xf8\x88\x80\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange-1of6") << QByteArray("\xfc") << QByteArray("\x84\x80\x80\x80\x80") << QString(); + QTest::newRow("outofrange-2of6") << QByteArray("\xfc\x84") << QByteArray("\x80\x80\x80\x80") << QString(); + QTest::newRow("outofrange-3of6") << QByteArray("\xfc\x84\x80") << QByteArray("\x80\x80\x80") << QString(); + QTest::newRow("outofrange-4of6") << QByteArray("\xfc\x84\x80\x80") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange-5of6") << QByteArray("\xfc\x84\x80\x80\x80") << QByteArray("\x80") << QString(); +} + +void tst_QTextCodec::utf8stateful() +{ + QFETCH(QByteArray, buffer1); + QFETCH(QByteArray, buffer2); + QFETCH(QString, result); + + QTextCodec *utf8codec = QTextCodec::codecForName("utf-8"); + QVERIFY(utf8codec); + + QTextCodec::ConverterState state; + memset(&state, 0, sizeof state); + + QString decoded1 = utf8codec->toUnicode(buffer1, buffer1.size(), &state); + if (result.isNull()) { + // the decoder may have found an early error (invalidChars > 0): + // if it has, remainingChars == 0; + // if it hasn't, then it must have a state + QVERIFY2((state.remainingChars == 0) != (state.invalidChars == 0), + "remainingChars = " + QByteArray::number(state.remainingChars) + + "; invalidChars = " + QByteArray::number(state.invalidChars)); + } else { + QVERIFY(state.remainingChars > 0); + QCOMPARE(state.invalidChars, 0); + } + + QString decoded2 = utf8codec->toUnicode(buffer2, 
buffer2.size(), &state); + QCOMPARE(state.remainingChars, 0); + if (result.isNull()) { + QVERIFY(state.invalidChars > 0); + } else { + QCOMPARE(decoded1 + decoded2, result); + } +} + void tst_QTextCodec::utfHeaders_data() { QTest::addColumn<QByteArray>("codecName"); |