5 files changed, 225 insertions, 135 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index d1fc5b851a..b0e0b3f010 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -128,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
     return rstr;
 }
 
+// Stateless UTF-8 decoder: drops a leading BOM and turns every undecodable byte into U+FFFD.
+QString QUtf8::convertToUnicode(const char *chars, int len)
+{
+    QString result(len + 1, Qt::Uninitialized); // worst case: one UTF-16 unit per input byte
+    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+
+    // skip a leading EF BB BF, as the ConverterState overload does when called without state
+    if (len >= 3 && src[0] == 0xef && src[1] == 0xbb && src[2] == 0xbf)
+        src += 3;
+    while (src < end) {
+        uchar b = *src++;
+        if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end) < 0)
+            *dst++ = QChar::ReplacementCharacter; // Error or EndOfString: replace just b
+    }
+    result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
+    return result;
+}
+
 QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
 {
     bool headerdone = false;
     ushort replacement = QChar::ReplacementCharacter;
     int need = 0;
-    int error = -1;
-    uint uc = 0;
-    uint min_uc = 0;
+    int invalid = 0;
+    int res;
+    uchar ch = 0;
+    // A sequence cut short by the end of input is saved in *state (if given) and resumed next call.
+    QString result(need + len + 1, Qt::Uninitialized); // worst case
+    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+
     if (state) {
         if (state->flags & QTextCodec::IgnoreHeader)
             headerdone = true;
         if (state->flags & QTextCodec::ConvertInvalidToNull)
             replacement = QChar::Null;
-        need = state->remainingChars;
-        if (need) {
-            uc = state->state_data[0];
-            min_uc = state->state_data[1];
-        }
-    }
-    if (!headerdone && len > 3
-        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
-        // starts with a byte order mark
-        chars += 3;
-        len -= 3;
-        headerdone = true;
-    }
-
-    QString result(need + len + 1, Qt::Uninitialized); // worst case
-    ushort *qch = (ushort *)result.unicode();
-    uchar ch;
-    int invalid = 0;
-
-    for (int i = 0; i < len; ++i) {
-        ch = chars[i];
-        if (need) {
-            if ((ch&0xc0) == 0x80) {
-                uc = (uc << 6) | (ch & 0x3f);
-                --need;
-                if (!need) {
-                    // utf-8 bom composes into 0xfeff code point
-                    if (!headerdone && uc == 0xfeff) {
-                        // don't do anything, just skip the BOM
-                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
-                        // surrogate pair
-                        Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
-                        *qch++ = QChar::highSurrogate(uc);
-                        *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
-                        // error: overlong sequence, UTF16 surrogate or non-character
-                        *qch++ = replacement;
-                        ++invalid;
-                    } else {
-                        *qch++ = uc;
-                    }
-                    headerdone = true;
-                }
-            } else {
-                // error
-                i = error;
-                *qch++ = replacement;
-                ++invalid;
-                need = 0;
-                headerdone = true;
-            }
-        } else {
-            if (ch < 128) {
-                *qch++ = ushort(ch);
-                headerdone = true;
-            } else if ((ch & 0xe0) == 0xc0) {
-                uc = ch & 0x1f;
-                need = 1;
-                error = i;
-                min_uc = 0x80;
-                headerdone = true;
-            } else if ((ch & 0xf0) == 0xe0) {
-                uc = ch & 0x0f;
-                need = 2;
-                error = i;
-                min_uc = 0x800;
-            } else if ((ch&0xf8) == 0xf0) {
-                uc = ch & 0x07;
-                need = 3;
-                error = i;
-                min_uc = 0x10000;
-                headerdone = true;
-            } else {
-                // error
-                *qch++ = replacement;
+        if (state->remainingChars) {
+            // finish the sequence left incomplete by the previous call before touching new input
+            uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+            int remainingCharsCount = state->remainingChars;
+            int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
+
+            memset(remainingCharsData, 0, sizeof(remainingCharsData));
+            memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+            memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+            const uchar *begin = &remainingCharsData[1];
+            res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+                    static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+            if (res == QUtf8BaseTraits::EndOfString) {
+                // if we got EndOfString again, then there were too few bytes in src;
+                // copy to our state and return
+                state->remainingChars = remainingCharsCount + newCharsToCopy;
+                memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+                return QString();
+            } else if (res == QUtf8BaseTraits::Error) {
                 ++invalid;
+                *dst++ = replacement;
+            } else if (!headerdone && res >= 0) {
+                // eat the UTF-8 BOM
                 headerdone = true;
+                if (dst[-1] == 0xfeff)
+                    --dst;
             }
+
+            // on success skip the new bytes that completed the saved sequence; on Error leave src alone
+            if (res >= 0)
+                src += res - remainingCharsCount;
         }
     }
-    if (!state && need > 0) {
-        // unterminated UTF sequence
-        for (int i = error; i < len; ++i) {
-            *qch++ = replacement;
+
+    // main body, stateless decoding
+    res = 0;
+    while (res >= 0 && src < end) {
+        ch = *src++;
+        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+        if (!headerdone && res >= 0) {
+            headerdone = true;
+            // eat the UTF-8 BOM
+            if (dst[-1] == 0xfeff)
+                --dst;
+        }
+        if (res == QUtf8BaseTraits::Error) {
+            res = 0;
             ++invalid;
+            *dst++ = replacement;
         }
     }
-    result.truncate(qch - (ushort *)result.unicode());
+
+    if (!state && res == QUtf8BaseTraits::EndOfString) {
+        // unterminated UTF sequence
+        *dst++ = QChar::ReplacementCharacter;
+        while (src++ < end)
+            *dst++ = QChar::ReplacementCharacter;
+    }
+
+    result.truncate(dst - (ushort *)result.unicode());
     if (state) {
         state->invalidChars += invalid;
-        state->remainingChars = need;
         if (headerdone)
             state->flags |= QTextCodec::IgnoreHeader;
-        state->state_data[0] = need ? uc : 0;
-        state->state_data[1] = need ? min_uc : 0;
+        if (res == QUtf8BaseTraits::EndOfString) {
+            --src; // unread the byte in ch
+            state->remainingChars = end - src;
+            memcpy(&state->state_data[0], src, end - src);
+        } else {
+            state->remainingChars = 0;
+        }
     }
     return result;
 }
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index 4f0e2394fe..c94a7a12e4 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -169,6 +169,110 @@ namespace QUtf8Functions
         Traits::appendByte(dst, 0x80 | (u & 0x3f));
         return 0;
     }
+
+    inline bool isContinuationByte(uchar b)
+    {
+        return (b >> 6) == 0x2; // continuation bytes have the form 10xxxxxx
+    }
+
+    /// Decodes the UTF-8 sequence whose first byte \a b the caller has already read;
+    /// \a src points at the byte after \a b and \a end at the end of the input.
+    /// On success the UTF-16 result is appended to \a dst, \a src is advanced past the
+    /// continuation bytes and the number of bytes consumed (including \a b) is returned.
+    /// On failure nothing is appended, Traits::advanceByte() is not called and the result
+    /// is negative: Traits::Error or Traits::EndOfString (input ended mid-sequence).
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        if (!Traits::skipAsciiHandling && b < 0x80) {
+            // US-ASCII maps to itself
+            Traits::appendUtf16(dst, b);
+            return 1;
+        }
+
+        // Lead bytes below 0xC0 are stray continuation bytes (or ASCII when
+        // Traits::skipAsciiHandling is set); 0xC0 and 0xC1 can only start overlong sequences.
+        if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1))
+            return Traits::Error;
+        // U+10FFFF, the last code point, is "\xF4\x8F\xBF\xBF": no lead byte is above 0xF4.
+        if (b >= 0xf5)
+            return Traits::Error;
+
+        // classify the lead byte: total length, overlong threshold and its payload bits
+        int seqLength;      // bytes in the whole sequence, lead byte included
+        uint lowestLegal;   // smallest code point that needs seqLength bytes
+        uint codePoint;     // payload bits gathered so far
+        if (b < 0xe0) {
+            seqLength = 2;
+            lowestLegal = 0x80;
+            codePoint = b & 0x1f;
+        } else if (b < 0xf0) {
+            seqLength = 3;
+            lowestLegal = 0x800;
+            codePoint = b & 0x0f;
+        } else {
+            seqLength = 4;
+            lowestLegal = 0x10000;
+            codePoint = b & 0x07;
+        }
+
+        // b was already read, so the sequence needs seqLength - 1 more bytes from src
+        const int trailing = Traits::availableBytes(src, end);
+        if (Q_UNLIKELY(trailing < seqLength - 1)) {
+            // Too few bytes for this sequence. It is only "unfinished" if everything that
+            // is present is a continuation byte; otherwise it is already an error.
+            for (int i = 0; i < trailing; ++i) {
+                if (!isContinuationByte(Traits::peekByte(src, i)))
+                    return Traits::Error;
+            }
+            return Traits::EndOfString;
+        }
+
+        // first continuation byte (every multi-byte sequence has one)
+        b = Traits::peekByte(src, 0);
+        if (!isContinuationByte(b))
+            return Traits::Error;
+        codePoint = (codePoint << 6) | (b & 0x3f);
+
+        if (seqLength > 2) {
+            // second continuation byte
+            b = Traits::peekByte(src, 1);
+            if (!isContinuationByte(b))
+                return Traits::Error;
+            codePoint = (codePoint << 6) | (b & 0x3f);
+        }
+
+        if (seqLength > 3) {
+            // third continuation byte
+            b = Traits::peekByte(src, 2);
+            if (!isContinuationByte(b))
+                return Traits::Error;
+            codePoint = (codePoint << 6) | (b & 0x3f);
+        }
+
+        // we've decoded something; unless the input is trusted, reject overlong encodings,
+        // surrogates, values past the last code point and (if the traits say so) non-characters
+        if (!Traits::isTrusted) {
+            if (codePoint < lowestLegal
+                    || QChar::isSurrogate(codePoint)
+                    || codePoint > QChar::LastValidCodePoint
+                    || (!Traits::allowNonCharacters && QChar::isNonCharacter(codePoint)))
+                return Traits::Error;
+        }
+
+        // write the UTF-16 sequence
+        if (QChar::requiresSurrogates(codePoint)) {
+            // UTF-8 decoded to something that requires a surrogate pair
+            Traits::appendUcs4(dst, codePoint);
+        } else {
+            // UTF-8 decoded and no surrogates are required
+            Traits::appendUtf16(dst, ushort(codePoint));
+        }
+
+        // only now consume the continuation bytes that were peeked at above
+        Traits::advanceByte(src, seqLength - 1);
+        return seqLength;
+    }
 }
 
 enum DataEndianness
@@ -180,6 +284,7 @@ enum DataEndianness
 
 struct QUtf8
 {
+    static QString convertToUnicode(const char *, int);
     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
     static QByteArray convertFromUnicode(const QChar *, int);
     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index bd2aa3450a..9544062dd9 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -4317,7 +4317,7 @@ QString QString::fromUtf8_helper(const char *str, int size)
         return QString();
 
     Q_ASSERT(size != -1);
-    return QUtf8::convertToUnicode(str, size, 0);
+    return QUtf8::convertToUnicode(str, size);
 }
 
 /*!
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index 8e1b3cf3b2..12b81ee7d4 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -456,7 +456,7 @@ void tst_QTextCodec::flagF7808080() const
     //QVERIFY(!codec->canEncode(QChar(0x1C0000)));
 
     QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-    QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0));
+    QCOMPARE(codec->toUnicode(input.constData(), input.length(), &state), QString(input.size(), QChar(0)));
 }
 
 void tst_QTextCodec::nonFlaggedEFBFBF() const
@@ -689,8 +689,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xbf);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.4") << utf8 << str << -1;
 
     // 2.2.5 U+03FFFFFF (not a valid Unicode character)
@@ -755,8 +754,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0x90);
     utf8 += char(0x80);
     utf8 += char(0x80);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.5") << utf8 << str << -1;
 
     // 3.1.1
@@ -1244,7 +1242,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8.clear();
     utf8 += char(0xc0);
     utf8 += char(0xaf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.1") << utf8 << str << -1;
 
     // 4.1.2
@@ -1252,7 +1250,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xe0);
     utf8 += char(0x80);
     utf8 += char(0xaf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.2") << utf8 << str << -1;
 
     // 4.1.3
@@ -1261,7 +1259,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0x80);
     utf8 += char(0x80);
     utf8 += char(0xaf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.3") << utf8 << str << -1;
 
     // 4.1.4
@@ -1289,7 +1287,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8.clear();
     utf8 += char(0xc1);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.1") << utf8 << str << -1;
 
     // 4.2.2
@@ -1297,7 +1295,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xe0);
     utf8 += char(0x9f);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.2") << utf8 << str << -1;
 
     // 4.2.3
@@ -1306,7 +1304,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0x8f);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.3") << utf8 << str << -1;
 
     // 4.2.4
@@ -1334,7 +1332,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8.clear();
     utf8 += char(0xc0);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.1") << utf8 << str << -1;
 
     // 4.3.2
@@ -1342,7 +1340,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xe0);
     utf8 += char(0x80);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.2") << utf8 << str << -1;
 
     // 4.3.3
@@ -1351,7 +1349,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0x80);
     utf8 += char(0x80);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.3") << utf8 << str << -1;
 
     // 4.3.4
@@ -1380,7 +1378,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xa0);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.1") << utf8 << str << -1;
 
     // 5.1.2
@@ -1388,7 +1386,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xad);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.2") << utf8 << str << -1;
 
     // 5.1.3
@@ -1396,7 +1394,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xae);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.3") << utf8 << str << -1;
 
     // 5.1.4
@@ -1404,7 +1402,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xaf);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.4") << utf8 << str << -1;
 
     // 5.1.5
@@ -1412,7 +1410,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xb0);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.5") << utf8 << str << -1;
 
     // 5.1.6
@@ -1420,7 +1418,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xbe);
     utf8 += char(0x80);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.6") << utf8 << str << -1;
 
     // 5.1.7
@@ -1428,7 +1426,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.7") << utf8 << str << -1;
 
     // 5.2.1
@@ -1439,9 +1437,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xb0);
     utf8 += char(0x80);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.1") << utf8 << str << -1;
 
     // 5.2.2
@@ -1452,9 +1448,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.2") << utf8 << str << -1;
 
     // 5.2.3
@@ -1465,9 +1459,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xb0);
     utf8 += char(0x80);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.3") << utf8 << str << -1;
 
     // 5.2.4
@@ -1478,9 +1470,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.4") << utf8 << str << -1;
 
     // 5.2.5
@@ -1491,9 +1481,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xb0);
     utf8 += char(0x80);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.5") << utf8 << str << -1;
 
     // 5.2.6
@@ -1504,9 +1492,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.6") << utf8 << str << -1;
 
     // 5.2.7
@@ -1517,9 +1503,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xb0);
     utf8 += char(0x80);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.7") << utf8 << str << -1;
 
     // 5.2.8
@@ -1530,9 +1514,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xed);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar(QChar::ReplacementCharacter);
-    str += QChar(QChar::ReplacementCharacter);
+    str = fromInvalidUtf8Sequence(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1;
 
     // 5.3.1 - non-character code
@@ -1541,7 +1523,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xbf);
     utf8 += char(0xbe);
     //str = QChar(QChar::ReplacementCharacter);
-    str = QString::fromUtf8(utf8);
+    str = QChar(0xfffe);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1;
 
     // 5.3.2 - non-character code
@@ -1550,7 +1532,7 @@ void tst_QTextCodec::utf8Codec_data()
     utf8 += char(0xbf);
     utf8 += char(0xbf);
     //str = QChar(QChar::ReplacementCharacter);
-    str = QString::fromUtf8(utf8);
+    str = QChar(0xffff);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1;
 }
 
diff --git a/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref b/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref
index eca786f688..87336aa00f 100644
--- a/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref
+++ b/tests/auto/xml/sax/qxmlsimplereader/xmldocs/not-wf/sa/170.xml.ref
@@ -1,6 +1,6 @@
 setDocumentLocator(locator={columnNumber=1, lineNumber=1})
 startDocument()
    startElement(namespaceURI="", localName="doc", qName="doc", atts=[])
-      characters(ch="�")
+      characters(ch="����")
    endElement(namespaceURI="", localName="doc", qName="doc")
 endDocument()