From 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Sun, 20 Oct 2013 17:43:46 +0100 Subject: Add a new UTF-8 decoder, similar to the encoder we've just added Like before, this is taken from the existing QUrl code and is optimized for ASCII handling (for the same reasons). And like previously, make QString::fromUtf8 use a stateless version of the codec, which is faster. There's a small change in behavior in the decoding: we insert a U+FFFD for each byte that cannot be decoded properly. Previously, it would "eat" all bad high-bit bytes and replace them all with one single U+FFFD. Either behavior is allowed by the UTF-8 specifications, even though this new behavior will cause misalignment in the Markus Kuhn sample UTF-8 text. Change-Id: Ib1b1f0b4291293bab345acaf376e00204ed87565 Reviewed-by: Olivier Goffart Reviewed-by: Thiago Macieira --- .../corelib/codecs/qtextcodec/tst_qtextcodec.cpp | 76 +++++++++------------- 1 file changed, 29 insertions(+), 47 deletions(-) (limited to 'tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp') diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp index 8e1b3cf3b2..12b81ee7d4 100644 --- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp @@ -456,7 +456,7 @@ void tst_QTextCodec::flagF7808080() const //QVERIFY(!codec->canEncode(QChar(0x1C0000))); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0)); + QCOMPARE(codec->toUnicode(input.constData(), input.length(), &state), QString(input.size(), QChar(0))); } void tst_QTextCodec::nonFlaggedEFBFBF() const @@ -689,8 +689,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xbf); utf8 += char(0xbf); utf8 += char(0xbf); - str.clear(); - str += QChar(QChar::ReplacementCharacter); + str = 
fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.4") << utf8 << str << -1; // 2.2.5 U+03FFFFFF (not a valid Unicode character) @@ -755,8 +754,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0x90); utf8 += char(0x80); utf8 += char(0x80); - str.clear(); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.5") << utf8 << str << -1; // 3.1.1 @@ -1244,7 +1242,7 @@ void tst_QTextCodec::utf8Codec_data() utf8.clear(); utf8 += char(0xc0); utf8 += char(0xaf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.1") << utf8 << str << -1; // 4.1.2 @@ -1252,7 +1250,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xe0); utf8 += char(0x80); utf8 += char(0xaf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.2") << utf8 << str << -1; // 4.1.3 @@ -1261,7 +1259,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0x80); utf8 += char(0x80); utf8 += char(0xaf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.3") << utf8 << str << -1; // 4.1.4 @@ -1289,7 +1287,7 @@ void tst_QTextCodec::utf8Codec_data() utf8.clear(); utf8 += char(0xc1); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.1") << utf8 << str << -1; // 4.2.2 @@ -1297,7 +1295,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xe0); utf8 += char(0x9f); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); 
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.2") << utf8 << str << -1; // 4.2.3 @@ -1306,7 +1304,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0x8f); utf8 += char(0xbf); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.3") << utf8 << str << -1; // 4.2.4 @@ -1334,7 +1332,7 @@ void tst_QTextCodec::utf8Codec_data() utf8.clear(); utf8 += char(0xc0); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.1") << utf8 << str << -1; // 4.3.2 @@ -1342,7 +1340,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xe0); utf8 += char(0x80); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.2") << utf8 << str << -1; // 4.3.3 @@ -1351,7 +1349,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0x80); utf8 += char(0x80); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.3") << utf8 << str << -1; // 4.3.4 @@ -1380,7 +1378,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xa0); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.1") << utf8 << str << -1; // 5.1.2 @@ -1388,7 +1386,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xad); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.2") << utf8 << str << -1; // 5.1.3 @@ -1396,7 +1394,7 @@ void 
tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xae); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.3") << utf8 << str << -1; // 5.1.4 @@ -1404,7 +1402,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xaf); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.4") << utf8 << str << -1; // 5.1.5 @@ -1412,7 +1410,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xb0); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.5") << utf8 << str << -1; // 5.1.6 @@ -1420,7 +1418,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xbe); utf8 += char(0x80); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.6") << utf8 << str << -1; // 5.1.7 @@ -1428,7 +1426,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xbf); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.7") << utf8 << str << -1; // 5.2.1 @@ -1439,9 +1437,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xb0); utf8 += char(0x80); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.1") << utf8 << str << -1; // 5.2.2 @@ -1452,9 +1448,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xbf); 
utf8 += char(0xbf); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.2") << utf8 << str << -1; // 5.2.3 @@ -1465,9 +1459,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xb0); utf8 += char(0x80); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.3") << utf8 << str << -1; // 5.2.4 @@ -1478,9 +1470,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xbf); utf8 += char(0xbf); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.4") << utf8 << str << -1; // 5.2.5 @@ -1491,9 +1481,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xb0); utf8 += char(0x80); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.5") << utf8 << str << -1; // 5.2.6 @@ -1504,9 +1492,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xbf); utf8 += char(0xbf); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.6") << utf8 << str << -1; // 5.2.7 @@ -1517,9 +1503,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xb0); utf8 += char(0x80); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = 
fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.7") << utf8 << str << -1; // 5.2.8 @@ -1530,9 +1514,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xed); utf8 += char(0xbf); utf8 += char(0xbf); - str.clear(); - str += QChar(QChar::ReplacementCharacter); - str += QChar(QChar::ReplacementCharacter); + str = fromInvalidUtf8Sequence(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1; // 5.3.1 - non-character code @@ -1541,7 +1523,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xbf); utf8 += char(0xbe); //str = QChar(QChar::ReplacementCharacter); - str = QString::fromUtf8(utf8); + str = QChar(0xfffe); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1; // 5.3.2 - non-character code @@ -1550,7 +1532,7 @@ void tst_QTextCodec::utf8Codec_data() utf8 += char(0xbf); utf8 += char(0xbf); //str = QChar(QChar::ReplacementCharacter); - str = QString::fromUtf8(utf8); + str = QChar(0xffff); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1; } -- cgit v1.2.3