2 files changed, 132 insertions, 8 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 072cda63aa..c5f580e13d 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -237,7 +237,20 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
 
 QString QUtf8::convertToUnicode(const char *chars, int len)
 {
-    QString result(len + 1, Qt::Uninitialized); // worst case
+    // UTF-8 to UTF-16 always needs the exact same number of words or less:
+    //    UTF-8     UTF-16
+    //   1 byte     1 word
+    //   2 bytes    1 word
+    //   3 bytes    1 word
+    //   4 bytes    2 words (one surrogate pair)
+    // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
+    // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
+    // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g., CJK).
+    //
+    // The table holds for invalid sequences too: we'll insert one replacement char
+    // per invalid byte.
+    QString result(len, Qt::Uninitialized);
+
     ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
     const uchar *src = reinterpret_cast<const uchar *>(chars);
     const uchar *end = src + len;
@@ -282,7 +295,18 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
     int res;
     uchar ch = 0;
 
-    QString result(need + len + 1, Qt::Uninitialized); // worst case
+    // See above for buffer requirements for stateless decoding. However, that
+    // fails if the state is not empty. The following situations can add to the
+    // requirements:
+    //  state contains      chars starts with           requirement
+    //   1 of 2 bytes       valid continuation          0
+    //   2 of 3 bytes       same                        0
+    //   3 of 4 bytes       same                        +1 (need to insert surrogate pair)
+    //   1 of 2 bytes       invalid continuation        +1 (need to insert replacement and restart)
+    //   2 of 3 bytes       same                        +1 (same)
+    //   3 of 4 bytes       same                        +1 (same)
+    QString result(need + len + 1, Qt::Uninitialized);
+
     ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
     const uchar *src = reinterpret_cast<const uchar *>(chars);
     const uchar *end = src + len;
@@ -305,15 +329,17 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
             const uchar *begin = &remainingCharsData[1];
             res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
                     static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
-            if (res == QUtf8BaseTraits::EndOfString) {
+            if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
+                // special case for len == 0:
+                // if we were supplied an empty string, terminate the previous, unfinished sequence with error
+                ++invalid;
+                *dst++ = replacement;
+            } else if (res == QUtf8BaseTraits::EndOfString) {
                 // if we got EndOfString again, then there were too few bytes in src;
                 // copy to our state and return
                 state->remainingChars = remainingCharsCount + newCharsToCopy;
                 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
                 return QString();
-            } else if (res == QUtf8BaseTraits::Error) {
-                ++invalid;
-                *dst++ = replacement;
             } else if (!headerdone && res >= 0) {
                 // eat the UTF-8 BOM
                 headerdone = true;
@@ -322,8 +348,10 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
             }
 
             // adjust src now that we have maybe consumed a few chars
-            //Q_ASSERT(res > remainingCharsCount)
-            src += res - remainingCharsCount;
+            if (res >= 0) {
+                Q_ASSERT(res > remainingCharsCount);
+                src += res - remainingCharsCount;
+            }
         }
     }
 
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index 12b81ee7d4..54e8f8c386 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -80,6 +80,9 @@ private slots:
     void utf8bom_data();
     void utf8bom();
 
+    void utf8stateful_data();
+    void utf8stateful();
+
     void utfHeaders_data();
     void utfHeaders();
 
@@ -1611,6 +1614,99 @@ void tst_QTextCodec::utf8bom()
     QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result);
 }
 
+void tst_QTextCodec::utf8stateful_data()
+{
+    QTest::addColumn<QByteArray>("buffer1");
+    QTest::addColumn<QByteArray>("buffer2");
+    QTest::addColumn<QString>("result");    // expected text; a null QString means a decoder error is expected
+
+    // valid continuations: buffer2 supplies the bytes that complete buffer1's sequence
+    QTest::newRow("1of2+valid") << QByteArray("\xc2") << QByteArray("\xa0") << "\xc2\xa0";
+    QTest::newRow("1of3+valid") << QByteArray("\xe0") << QByteArray("\xa0\x80") << "\xe0\xa0\x80";
+    QTest::newRow("2of3+valid") << QByteArray("\xe0\xa0") << QByteArray("\x80") << "\xe0\xa0\x80";
+    QTest::newRow("1of4+valid") << QByteArray("\360") << QByteArray("\220\210\203") << "\360\220\210\203";
+    QTest::newRow("2of4+valid") << QByteArray("\360\220") << QByteArray("\210\203") << "\360\220\210\203";
+    QTest::newRow("3of4+valid") << QByteArray("\360\220\210") << QByteArray("\203") << "\360\220\210\203";
+    QTest::newRow("1ofBom+valid") << QByteArray("\xef") << QByteArray("\xbb\xbf") << "";
+    QTest::newRow("2ofBom+valid") << QByteArray("\xef\xbb") << QByteArray("\xbf") << "";
+
+    // invalid continuation: buffer2 begins with 'a', which is not a continuation byte
+    QTest::newRow("1of2+invalid") << QByteArray("\xc2") << QByteArray("a") << QString();
+    QTest::newRow("1of3+invalid") << QByteArray("\xe0") << QByteArray("a") << QString();
+    QTest::newRow("2of3+invalid") << QByteArray("\xe0\xa0") << QByteArray("a") << QString();
+    QTest::newRow("1of4+invalid") << QByteArray("\360") << QByteArray("a") << QString();
+    QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
+    QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
+
+    // invalid: sequence too short (the empty second buffer causes a state reset)
+    QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
+    QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
+    QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
+    QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
+    QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
+    QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
+
+    // overlong sequences: the value is encoded with more bytes than it needs
+    QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
+    QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
+    QTest::newRow("overlong-2of3") << QByteArray("\xe0\x81") << QByteArray("\x81") << QString();
+    QTest::newRow("overlong-1of4") << QByteArray("\xf0") << QByteArray("\x80\x81\x81") << QString();
+    QTest::newRow("overlong-2of4") << QByteArray("\xf0\x80") << QByteArray("\x81\x81") << QString();
+    QTest::newRow("overlong-3of4") << QByteArray("\xf0\x80\x81") << QByteArray("\x81") << QString();
+
+    // out of range:
+    // lead byte 0xF4 followed by 0x90 or higher, and lead bytes 0xF5 and up, exceed U+10FFFF
+    QTest::newRow("outofrange1-1of4") << QByteArray("\xf4") << QByteArray("\x90\x80\x80") << QString();
+    QTest::newRow("outofrange1-2of4") << QByteArray("\xf4\x90") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange1-3of4") << QByteArray("\xf4\x90\x80") << QByteArray("\x80") << QString();
+    QTest::newRow("outofrange2-1of4") << QByteArray("\xf5") << QByteArray("\x90\x80\x80") << QString();
+    QTest::newRow("outofrange2-2of4") << QByteArray("\xf5\x90") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange2-3of4") << QByteArray("\xf5\x90\x80") << QByteArray("\x80") << QString();
+    QTest::newRow("outofrange-1of5") << QByteArray("\xf8") << QByteArray("\x88\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-2of5") << QByteArray("\xf8\x88") << QByteArray("\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-3of5") << QByteArray("\xf8\x88\x80") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange-4of5") << QByteArray("\xf8\x88\x80\x80") << QByteArray("\x80") << QString();
+    QTest::newRow("outofrange-1of6") << QByteArray("\xfc") << QByteArray("\x84\x80\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-2of6") << QByteArray("\xfc\x84") << QByteArray("\x80\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-3of6") << QByteArray("\xfc\x84\x80") << QByteArray("\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-4of6") << QByteArray("\xfc\x84\x80\x80") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange-5of6") << QByteArray("\xfc\x84\x80\x80\x80") << QByteArray("\x80") << QString();
+}
+
+void tst_QTextCodec::utf8stateful()
+{
+    QFETCH(QByteArray, buffer1);
+    QFETCH(QByteArray, buffer2);
+    QFETCH(QString, result);
+
+    QTextCodec *utf8codec = QTextCodec::codecForName("utf-8");
+    QVERIFY(utf8codec);
+
+    QTextCodec::ConverterState state;
+    memset(&state, 0, sizeof state); // zero every field so decoding starts from an empty state
+
+    QString decoded1 = utf8codec->toUnicode(buffer1, buffer1.size(), &state);
+    if (result.isNull()) {
+        // buffer1 alone is either already rejected (invalidChars > 0, nothing buffered)
+        // or still pending (remainingChars > 0, no error counted yet);
+        // exactly one of the two counters must be zero, hence the inequality below
+        QVERIFY2((state.remainingChars == 0) != (state.invalidChars == 0),
+                 "remainingChars = " + QByteArray::number(state.remainingChars) +
+                 "; invalidChars = " + QByteArray::number(state.invalidChars));
+    } else {
+        QVERIFY(state.remainingChars > 0); // buffer1 stops mid-sequence, so bytes must be held back
+        QCOMPARE(state.invalidChars, 0);
+    }
+
+    QString decoded2 = utf8codec->toUnicode(buffer2, buffer2.size(), &state);
+    QCOMPARE(state.remainingChars, 0); // nothing may stay pending after the second buffer
+    if (result.isNull()) {
+        QVERIFY(state.invalidChars > 0);
+    } else {
+        QCOMPARE(decoded1 + decoded2, result);
+    }
+}
+
 void tst_QTextCodec::utfHeaders_data()
 {
     QTest::addColumn<QByteArray>("codecName");