From b23e72a772a5abfdf9784ab80db9a4d620137515 Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Mon, 12 May 2014 17:05:36 -0700
Subject: Fix stateful handling of invalid UTF-8 straddling buffer borders

When a UTF-8 sequence is too short, QUtf8Functions::fromUtf8 returns
EndOfString. If the decoder is stateful, we must save the state and then
restart it when more data is supplied.

The new stateful decoder (8dd47e34b9b96ac27a99cdcf10b8aec506882fc2)
mishandled the Error case by advancing the src pointer by a negative
number, thus causing a buffer overflow (the issue of the task).

And it also did not handle the len == 0 case properly, though neither
did the older decoder.

Task-number: QTBUG-38939
Change-Id: Ie03d7c55a04e51ee838ccdb3a01e5b989d8e67aa
Reviewed-by: Kai Koehne <kai.koehne@digia.com>
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
---
 src/corelib/codecs/qutfcodec.cpp                   | 44 ++++++++--
 .../corelib/codecs/qtextcodec/tst_qtextcodec.cpp   | 96 ++++++++++++++++++++++
 2 files changed, 132 insertions(+), 8 deletions(-)
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 072cda63aa..c5f580e13d 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -237,7 +237,20 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
 
 QString QUtf8::convertToUnicode(const char *chars, int len)
 {
-    QString result(len + 1, Qt::Uninitialized); // worst case
+    // UTF-8 to UTF-16 always needs the exact same number of words or fewer:
+    //    UTF-8     UTF-16
+    //   1 byte     1 word
+    //   2 bytes    1 word
+    //   3 bytes    1 word
+    //   4 bytes    2 words (one surrogate pair)
+    // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
+    // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
+    // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g., CJK).
+    //
+    // The table holds for invalid sequences too: we'll insert one replacement char
+    // per invalid byte.
+    QString result(len, Qt::Uninitialized);
+
     ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
     const uchar *src = reinterpret_cast<const uchar *>(chars);
     const uchar *end = src + len;
@@ -282,7 +295,18 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
     int res;
     uchar ch = 0;
 
-    QString result(need + len + 1, Qt::Uninitialized); // worst case
+    // See above for buffer requirements for stateless decoding. However, that
+    // fails if the state is not empty. The following situations can add to the
+    // requirements:
+    //  state contains      chars starts with           requirement
+    //   1 of 2 bytes       valid continuation          0
+    //   2 of 3 bytes       same                        0
+    //   3 of 4 bytes       same                        +1 (need to insert surrogate pair)
+    //   1 of 2 bytes       invalid continuation        +1 (need to insert replacement and restart)
+    //   2 of 3 bytes       same                        +1 (same)
+    //   3 of 4 bytes       same                        +1 (same)
+    QString result(need + len + 1, Qt::Uninitialized);
+
     ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
     const uchar *src = reinterpret_cast<const uchar *>(chars);
     const uchar *end = src + len;
@@ -305,15 +329,17 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
             const uchar *begin = &remainingCharsData[1];
             res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
                     static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
-            if (res == QUtf8BaseTraits::EndOfString) {
+            if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
+                // special case for len == 0:
+                // if we were supplied an empty string, terminate the previous, unfinished sequence with error
+                ++invalid;
+                *dst++ = replacement;
+            } else if (res == QUtf8BaseTraits::EndOfString) {
                 // if we got EndOfString again, then there were too few bytes in src;
                 // copy to our state and return
                 state->remainingChars = remainingCharsCount + newCharsToCopy;
                 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
                 return QString();
-            } else if (res == QUtf8BaseTraits::Error) {
-                ++invalid;
-                *dst++ = replacement;
             } else if (!headerdone && res >= 0) {
                 // eat the UTF-8 BOM
                 headerdone = true;
@@ -322,8 +348,10 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
             }
 
             // adjust src now that we have maybe consumed a few chars
-            //Q_ASSERT(res > remainingCharsCount)
-            src += res - remainingCharsCount;
+            if (res >= 0) {
+                Q_ASSERT(res > remainingCharsCount);
+                src += res - remainingCharsCount;
+            }
         }
     }
 
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index 12b81ee7d4..54e8f8c386 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -80,6 +80,9 @@ private slots:
     void utf8bom_data();
     void utf8bom();
 
+    void utf8stateful_data();
+    void utf8stateful();
+
     void utfHeaders_data();
     void utfHeaders();
 
@@ -1611,6 +1614,99 @@ void tst_QTextCodec::utf8bom()
     QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result);
 }
 
+void tst_QTextCodec::utf8stateful_data()
+{   // data rows for utf8stateful(): one byte sequence split into two chunks, plus the expected text
+    QTest::addColumn<QByteArray>("buffer1");
+    QTest::addColumn<QByteArray>("buffer2");
+    QTest::addColumn<QString>("result");    // a null QString marks a row that must end as a decoder error
+
+    // valid sequences: buffer2 supplies exactly the bytes that buffer1 is still missing
+    QTest::newRow("1of2+valid") << QByteArray("\xc2") << QByteArray("\xa0") << "\xc2\xa0";
+    QTest::newRow("1of3+valid") << QByteArray("\xe0") << QByteArray("\xa0\x80") << "\xe0\xa0\x80";
+    QTest::newRow("2of3+valid") << QByteArray("\xe0\xa0") << QByteArray("\x80") << "\xe0\xa0\x80";
+    QTest::newRow("1of4+valid") << QByteArray("\360") << QByteArray("\220\210\203") << "\360\220\210\203";
+    QTest::newRow("2of4+valid") << QByteArray("\360\220") << QByteArray("\210\203") << "\360\220\210\203";
+    QTest::newRow("3of4+valid") << QByteArray("\360\220\210") << QByteArray("\203") << "\360\220\210\203";
+    QTest::newRow("1ofBom+valid") << QByteArray("\xef") << QByteArray("\xbb\xbf") << "";
+    QTest::newRow("2ofBom+valid") << QByteArray("\xef\xbb") << QByteArray("\xbf") << "";
+
+    // invalid continuation: buffer2 starts with an ASCII byte where a continuation byte is required
+    QTest::newRow("1of2+invalid") << QByteArray("\xc2") << QByteArray("a") << QString();
+    QTest::newRow("1of3+invalid") << QByteArray("\xe0") << QByteArray("a") << QString();
+    QTest::newRow("2of3+invalid") << QByteArray("\xe0\xa0") << QByteArray("a") << QString();
+    QTest::newRow("1of4+invalid") << QByteArray("\360") << QByteArray("a") << QString();
+    QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString();
+    QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString();
+
+    // invalid: sequence too short (the empty second buffer ends the pending sequence with an error)
+    QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString();
+    QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString();
+    QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString();
+    QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString();
+    QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString();
+    QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString();
+
+    // overlong sequences: encodings that use more bytes than the code point requires
+    QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString();
+    QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString();
+    QTest::newRow("overlong-2of3") << QByteArray("\xe0\x81") << QByteArray("\x81") << QString();
+    QTest::newRow("overlong-1of4") << QByteArray("\xf0") << QByteArray("\x80\x81\x81") << QString();
+    QTest::newRow("overlong-2of4") << QByteArray("\xf0\x80") << QByteArray("\x81\x81") << QString();
+    QTest::newRow("overlong-3of4") << QByteArray("\xf0\x80\x81") << QByteArray("\x81") << QString();
+
+    // out of range: 0xF4 followed by 0x90 or above encodes a code point above U+10FFFF,
+    // and the lead bytes 0xF5, 0xF8 (5-byte form) and 0xFC (6-byte form) are never valid in UTF-8
+    QTest::newRow("outofrange1-1of4") << QByteArray("\xf4") << QByteArray("\x90\x80\x80") << QString();
+    QTest::newRow("outofrange1-2of4") << QByteArray("\xf4\x90") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange1-3of4") << QByteArray("\xf4\x90\x80") << QByteArray("\x80") << QString();
+    QTest::newRow("outofrange2-1of4") << QByteArray("\xf5") << QByteArray("\x90\x80\x80") << QString();
+    QTest::newRow("outofrange2-2of4") << QByteArray("\xf5\x90") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange2-3of4") << QByteArray("\xf5\x90\x80") << QByteArray("\x80") << QString();
+    QTest::newRow("outofrange-1of5") << QByteArray("\xf8") << QByteArray("\x88\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-2of5") << QByteArray("\xf8\x88") << QByteArray("\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-3of5") << QByteArray("\xf8\x88\x80") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange-4of5") << QByteArray("\xf8\x88\x80\x80") << QByteArray("\x80") << QString();
+    QTest::newRow("outofrange-1of6") << QByteArray("\xfc") << QByteArray("\x84\x80\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-2of6") << QByteArray("\xfc\x84") << QByteArray("\x80\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-3of6") << QByteArray("\xfc\x84\x80") << QByteArray("\x80\x80\x80") << QString();
+    QTest::newRow("outofrange-4of6") << QByteArray("\xfc\x84\x80\x80") << QByteArray("\x80\x80") << QString();
+    QTest::newRow("outofrange-5of6") << QByteArray("\xfc\x84\x80\x80\x80") << QByteArray("\x80") << QString();
+}
+
+void tst_QTextCodec::utf8stateful()
+{   // decodes buffer1, then buffer2, through one shared ConverterState and checks the state after each call
+    QFETCH(QByteArray, buffer1);
+    QFETCH(QByteArray, buffer2);
+    QFETCH(QString, result);
+
+    QTextCodec *utf8codec = QTextCodec::codecForName("utf-8");
+    QVERIFY(utf8codec);
+
+    QTextCodec::ConverterState state;
+    memset(&state, 0, sizeof state);    // start both calls from an all-zero state
+
+    QString decoded1 = utf8codec->toUnicode(buffer1, buffer1.size(), &state);
+    if (result.isNull()) {
+        // error rows: after the first buffer exactly one of the two counters must be non-zero --
+        // either the error was already detected (invalidChars != 0 and remainingChars == 0),
+        // or nothing was flagged yet and the unfinished sequence is being held in the state
+        QVERIFY2((state.remainingChars == 0) != (state.invalidChars == 0),
+                 "remainingChars = " + QByteArray::number(state.remainingChars) +
+                 "; invalidChars = " + QByteArray::number(state.invalidChars));
+    } else {
+        QVERIFY(state.remainingChars > 0);    // valid rows always end buffer1 mid-sequence
+        QCOMPARE(state.invalidChars, 0);
+    }
+
+    QString decoded2 = utf8codec->toUnicode(buffer2, buffer2.size(), &state);
+    QCOMPARE(state.remainingChars, 0);    // nothing may still be pending after the second buffer
+    if (result.isNull()) {
+        QVERIFY(state.invalidChars > 0);
+    } else {
+        QCOMPARE(decoded1 + decoded2, result);
+    }
+}
+
 void tst_QTextCodec::utfHeaders_data()
 {
     QTest::addColumn<QByteArray>("codecName");
-- 
cgit v1.2.3