From f64a6bd638d399403845fe52e6f8e52889f1f52b Mon Sep 17 00:00:00 2001 From: Lars Knoll Date: Thu, 30 Apr 2020 10:50:44 +0200 Subject: Start work on a new API to replace QTextCodec The new QStringEncoder and QStringDecoder classes (with a common QStringConverter base class) are there to replace QTextCodec in Qt 6. It currently uses a trivial wrapper around the utf encoding functionality. Added some autotests, mostly copied from the text codec tests. Change-Id: Ib6eeee55fba918b9424be244cbda9dfd5096f7eb Reviewed-by: Thiago Macieira --- .../corelib/codecs/qtextcodec/tst_qtextcodec.cpp | 1 - tests/auto/corelib/text/CMakeLists.txt | 1 + .../corelib/text/qstringconverter/CMakeLists.txt | 10 + .../text/qstringconverter/qstringconverter.pro | 5 + .../text/qstringconverter/tst_qstringconverter.cpp | 1591 ++++++++++++++++++++ tests/auto/corelib/text/qstringconverter/utf8.txt | 1 + tests/auto/corelib/text/text.pro | 1 + 7 files changed, 1609 insertions(+), 1 deletion(-) create mode 100644 tests/auto/corelib/text/qstringconverter/CMakeLists.txt create mode 100644 tests/auto/corelib/text/qstringconverter/qstringconverter.pro create mode 100644 tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp create mode 100644 tests/auto/corelib/text/qstringconverter/utf8.txt (limited to 'tests') diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp index 799c0bfc77..90edac3ed0 100644 --- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp @@ -1686,7 +1686,6 @@ void tst_QTextCodec::utf8stateful() QVERIFY(utf8codec); QTextCodec::ConverterState state; - memset(&state, 0, sizeof state); QString decoded1 = utf8codec->toUnicode(buffer1, buffer1.size(), &state); if (result.isNull()) { diff --git a/tests/auto/corelib/text/CMakeLists.txt b/tests/auto/corelib/text/CMakeLists.txt index e23de92c8c..19cd71a987 100644 --- 
a/tests/auto/corelib/text/CMakeLists.txt +++ b/tests/auto/corelib/text/CMakeLists.txt @@ -14,6 +14,7 @@ add_subdirectory(qstring) add_subdirectory(qstring_no_cast_from_bytearray) add_subdirectory(qstringapisymmetry) add_subdirectory(qstringbuilder) +add_subdirectory(qstringconverter) add_subdirectory(qstringiterator) add_subdirectory(qstringlist) add_subdirectory(qstringmatcher) diff --git a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt new file mode 100644 index 0000000000..582e4fcf93 --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt @@ -0,0 +1,10 @@ +# Generated from qstringconverter.pro. + +##################################################################### +## tst_qstringconverter Test: +##################################################################### + +qt_add_test(tst_qstringconverter + SOURCES + tst_qstringconverter.cpp +) diff --git a/tests/auto/corelib/text/qstringconverter/qstringconverter.pro b/tests/auto/corelib/text/qstringconverter/qstringconverter.pro new file mode 100644 index 0000000000..6cd797e805 --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/qstringconverter.pro @@ -0,0 +1,5 @@ +CONFIG += testcase +QT = core testlib +SOURCES = tst_qstringconverter.cpp + +TARGET = tst_qstringconverter diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp new file mode 100644 index 0000000000..c94f8146ff --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -0,0 +1,1591 @@ +/**************************************************************************** +** +** Copyright (C) 2020 The Qt Company Ltd. +** Copyright (C) 2016 Intel Corporation. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the test suite of the Qt Toolkit. 
+** +** $QT_BEGIN_LICENSE:GPL-EXCEPT$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3 as published by the Free Software +** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include + +#include +#include + +/* Test class for QStringDecoder and QStringEncoder. The private slots declare the individual test functions; the *_data() slots presumably supply the QTest rows for the slot of the same base name (Qt Test convention - confirm). */ +class tst_QStringConverter : public QObject +{ + Q_OBJECT + +private slots: + void threadSafety(); + + void convertUtf8(); + + void nonFlaggedCodepointFFFF() const; + void flagF7808080() const; + void nonFlaggedEFBFBF() const; + void decode0D() const; + + void utf8Codec_data(); + void utf8Codec(); + + void utf8bom_data(); + void utf8bom(); + + void utf8stateful_data(); + void utf8stateful(); + + void utfHeaders_data(); + void utfHeaders(); +}; + +/* Loads the utf8.txt test data and checks QStringDecoder/QStringEncoder UTF-8 conversions against QString::fromUtf8()/toUtf8(). */ +void tst_QStringConverter::convertUtf8() +{ + QFile file(QFINDTESTDATA("utf8.txt")); + + if (!file.open(QIODevice::ReadOnly)) + QFAIL(qPrintable("File could not be opened: " + file.errorString())); + + QByteArray ba = file.readAll(); + QVERIFY(!ba.isEmpty()); + + { + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + QString uniString = decoder(ba); + QCOMPARE(uniString, QString::fromUtf8(ba)); + QCOMPARE(ba, 
uniString.toUtf8()); + + QStringEncoder encoder(QStringEncoder::Utf8); + QCOMPARE(ba, encoder(uniString)); + } + + { + // once again converting char by char + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + QString uniString; + for (int i = 0; i < ba.size(); ++i) + uniString += decoder(ba.constData() + i, 1); + QCOMPARE(uniString, QString::fromUtf8(ba)); + + QStringEncoder encoder(QStringEncoder::Utf8); + QByteArray reencoded; + for (int i = 0; i < uniString.size(); ++i) + reencoded += encoder(uniString.constData() + i, 1); + /* Compare the bytes accumulated one QChar at a time; a second one-shot encode would leave the incremental result unchecked. */ + QCOMPARE(ba, reencoded); + } +} + +void tst_QStringConverter::nonFlaggedCodepointFFFF() const +{ + //Check that the code point 0xFFFF (=non-character code 0xEFBFBF) is not flagged + const QChar ch(0xFFFF); + + QStringEncoder encoder(QStringEncoder::Utf8); + QVERIFY(encoder.isValid()); + + const QByteArray asEncoded = encoder(QStringView(&ch, 1)); + QCOMPARE(asEncoded, QByteArray("\357\277\277")); + + QByteArray ffff("\357\277\277"); + QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::ConvertInvalidToNull); + QVERIFY(decoder.isValid()); + QVERIFY(decoder(ffff) == QString(1, ch)); +} + +void tst_QStringConverter::flagF7808080() const +{ + /* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml, + * whose description reads: + * + * "Four byte UTF-8 encodings can encode UCS-4 characters + * which are beyond the range of legal XML characters + * (and can't be expressed in Unicode surrogate pairs). + * This document holds such a character." + * + * In binary, this is: + * 11110111100000001000000010000000 + * * * * * + * 11110www10xxxxxx10yyyyyy10zzzzzz + * + * With multibyte logic removed it is the codepoint 0x1C0000. 
+ */ + QByteArray input; + input.resize(4); + input[0] = char(0xF7); + input[1] = char(0x80); + input[2] = char(0x80); + input[3] = char(0x80); + + QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::ConvertInvalidToNull); + QVERIFY(decoder.isValid()); + + /* Expected result: one QChar(0) per input byte, i.e. the whole 4-byte sequence is rejected. */ + QCOMPARE(decoder(input), QString(input.size(), QChar(0))); +} + +void tst_QStringConverter::nonFlaggedEFBFBF() const +{ + /* Check that the codec does NOT flag EFBFBF. + * This is a regression test; see QTBUG-33229 + */ + QByteArray validInput; + validInput.resize(3); + validInput[0] = char(0xEF); + validInput[1] = char(0xBF); + validInput[2] = char(0xBF); + + { + QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::ConvertInvalidToNull); + QVERIFY(decoder.isValid()); + QVERIFY(decoder(validInput) == QString::fromUtf8(QByteArray::fromHex("EFBFBF"))); + } + + // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character + { + QByteArray start("B"); + start.append(validInput); + + QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::ConvertInvalidToNull); + QVERIFY(decoder.isValid()); + QVERIFY(decoder(start) == QString::fromUtf8(QByteArray("B").append(QByteArray::fromHex("EFBFBF")))); + } +} + +/* Round-trips the bytes 'A', '\r', 'B' through QString::fromUtf8()/toUtf8() and checks they come back unchanged. NOTE(review): this goes through QString only, not QStringDecoder/QStringEncoder - confirm that is intended. */ +void tst_QStringConverter::decode0D() const +{ + QByteArray input; + input.resize(3); + input[0] = 'A'; + input[1] = '\r'; + input[2] = 'B'; + + QCOMPARE(QString::fromUtf8(input.constData()).toUtf8(), input); +} + +/* Builds the expected decoding of an invalid sequence: a string of ba.size() QChar::ReplacementCharacter, i.e. one replacement character per input byte. */ +static QString fromInvalidUtf8Sequence(const QByteArray &ba) +{ + return QString().fill(QChar::ReplacementCharacter, ba.size()); +} + +// copied from tst_QString::fromUtf8_data() +void tst_QStringConverter::utf8Codec_data() +{ + QTest::addColumn("utf8"); + QTest::addColumn("res"); + QTest::addColumn("len"); + QString str; + + QTest::newRow("str0") << QByteArray("abcdefgh") << QString("abcdefgh") << -1; + QTest::newRow("str0-len") << QByteArray("abcdefgh") << QString("abc") << 3; + QTest::newRow("str1") << 
QByteArray("\303\266\303\244\303\274\303\226\303\204\303\234\303\270\303\246\303\245\303\230\303\206\303\205") + << QString::fromLatin1("\366\344\374\326\304\334\370\346\345\330\306\305") << -1; + QTest::newRow("str1-len") << QByteArray("\303\266\303\244\303\274\303\226\303\204\303\234\303\270\303\246\303\245\303\230\303\206\303\205") + << QString::fromLatin1("\366\344\374\326\304") << 10; + + str += QChar(0x05e9); + str += QChar(0x05d3); + str += QChar(0x05d2); + QTest::newRow("str2") << QByteArray("\327\251\327\223\327\222") << str << -1; + + str = QChar(0x05e9); + QTest::newRow("str2-len") << QByteArray("\327\251\327\223\327\222") << str << 2; + + str = QChar(0x20ac); + str += " some text"; + QTest::newRow("str3") << QByteArray("\342\202\254 some text") << str << -1; + + str = QChar(0x20ac); + str += " some "; + QTest::newRow("str3-len") << QByteArray("\342\202\254 some text") << str << 9; + + str = "hello"; + str += QChar::ReplacementCharacter; + str += QChar(0x68); + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QChar(0x61); + str += QChar::ReplacementCharacter; + QTest::newRow("invalid utf8") << QByteArray("hello\344h\344\344\366\344a\304") << str << -1; + QTest::newRow("invalid utf8-len") << QByteArray("hello\344h\344\344\366\344a\304") << QString("hello") << 5; + + str = "Prohl"; + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QLatin1Char('e'); + str += QChar::ReplacementCharacter; + str += " plugin"; + str += QChar::ReplacementCharacter; + str += " Netscape"; + + QTest::newRow("task28417") << QByteArray("Prohl\355\276e\350 plugin\371 Netscape") << str << -1; + QTest::newRow("task28417-len") << QByteArray("Prohl\355\276e\350 plugin\371 Netscape") << QString("") << 0; + + QTest::newRow("null-1") << QByteArray() << QString() << -1; + QTest::newRow("null0") << QByteArray() << QString() << 0; + // 
QTest::newRow("null5") << QByteArray() << QString() << 5; + QTest::newRow("empty-1") << QByteArray("\0abcd", 5) << QString() << -1; + QTest::newRow("empty0") << QByteArray() << QString() << 0; + QTest::newRow("empty5") << QByteArray("\0abcd", 5) << QString::fromLatin1("\0abcd", 5) << 5; + QTest::newRow("other-1") << QByteArray("ab\0cd", 5) << QString::fromLatin1("ab") << -1; + QTest::newRow("other5") << QByteArray("ab\0cd", 5) << QString::fromLatin1("ab\0cd", 5) << 5; + + str = "Old Italic: "; + str += QChar(0xd800); + str += QChar(0xdf00); + str += QChar(0xd800); + str += QChar(0xdf01); + str += QChar(0xd800); + str += QChar(0xdf02); + str += QChar(0xd800); + str += QChar(0xdf03); + str += QChar(0xd800); + str += QChar(0xdf04); + QTest::newRow("surrogate") << QByteArray("Old Italic: \360\220\214\200\360\220\214\201\360\220\214\202\360\220\214\203\360\220\214\204") << str << -1; + + QTest::newRow("surrogate-len") << QByteArray("Old Italic: \360\220\214\200\360\220\214\201\360\220\214\202\360\220\214\203\360\220\214\204") << str.left(16) << 20; + + // from http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html + + // 2.1.1 U+00000000 + QByteArray utf8; + utf8 += char(0x00); + str = QChar(QChar::Null); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.1") << utf8 << str << 1; + + // 2.1.2 U+00000080 + utf8.clear(); + utf8 += char(0xc2); + utf8 += char(0x80); + str = QChar(0x80); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.2") << utf8 << str << -1; + + // 2.1.3 U+00000800 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0xa0); + utf8 += char(0x80); + str = QChar(0x800); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.3") << utf8 << str << -1; + + // 2.1.4 U+00010000 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x90); + utf8 += char(0x80); + utf8 += char(0x80); + str.clear(); + str += QChar(0xd800); + str += QChar(0xdc00); + 
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.4") << utf8 << str << -1; + + // 2.1.5 U+00200000 (not a valid Unicode character) + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x88); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.5") << utf8 << str << -1; + + // 2.1.6 U+04000000 (not a valid Unicode character) + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x84); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.6") << utf8 << str << -1; + + // 2.2.1 U+0000007F + utf8.clear(); + utf8 += char(0x7f); + str = QChar(0x7f); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.1") << utf8 << str << -1; + + // 2.2.2 U+000007FF + utf8.clear(); + utf8 += char(0xdf); + utf8 += char(0xbf); + str = QChar(0x7ff); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1; + + // 2.2.3 U+000FFFF - non-character code + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = QString::fromUtf8(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1; + + // 2.2.4 U+001FFFFF + utf8.clear(); + utf8 += char(0xf7); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.4") << utf8 << str << -1; + + // 2.2.5 U+03FFFFFF (not a valid Unicode character) + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.5") << utf8 << str 
<< -1; + + // 2.2.6 U+7FFFFFFF + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.6") << utf8 << str << -1; + + // 2.3.1 U+0000D7FF + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0x9f); + utf8 += char(0xbf); + str = QChar(0xd7ff); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.1") << utf8 << str << -1; + + // 2.3.2 U+0000E000 + utf8.clear(); + utf8 += char(0xee); + utf8 += char(0x80); + utf8 += char(0x80); + str = QChar(0xe000); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.2") << utf8 << str << -1; + + // 2.3.3 U+0000FFFD + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbd); + str = QChar(QChar::ReplacementCharacter); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.3") << utf8 << str << -1; + + // 2.3.4 U+0010FFFD + utf8.clear(); + utf8 += char(0xf4); + utf8 += char(0x8f); + utf8 += char(0xbf); + utf8 += char(0xbd); + str.clear(); + str += QChar(0xdbff); + str += QChar(0xdffd); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.4") << utf8 << str << -1; + + // 2.3.5 U+00110000 + utf8.clear(); + utf8 += char(0xf4); + utf8 += char(0x90); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.5") << utf8 << str << -1; + + // 3.1.1 + utf8.clear(); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.1") << utf8 << str << -1; + + // 3.1.2 + utf8.clear(); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.2") << utf8 << str << -1; + + // 3.1.3 + 
utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.3") << utf8 << str << -1; + + // 3.1.4 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.4") << utf8 << str << -1; + + // 3.1.5 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.5") << utf8 << str << -1; + + // 3.1.6 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.6") << utf8 << str << -1; + + // 3.1.7 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.7") << utf8 << str << -1; + + // 3.1.8 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.8") << utf8 << str << -1; + + // 3.1.9 + utf8.clear(); + for (uint i = 0x80; i<= 0xbf; ++i) + utf8 += i; + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.9") << utf8 << str << -1; + + // 3.2.1 + utf8.clear(); + str.clear(); + for (uint i = 0xc8; i <= 0xdf; ++i) { + utf8 += i; + utf8 += char(0x20); + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } 
+ QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.1") << utf8 << str << -1; + + // 3.2.2 + utf8.clear(); + str.clear(); + for (uint i = 0xe0; i <= 0xef; ++i) { + utf8 += i; + utf8 += char(0x20); + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.2") << utf8 << str << -1; + + // 3.2.3 + utf8.clear(); + str.clear(); + for (uint i = 0xf0; i <= 0xf7; ++i) { + utf8 += i; + utf8 += 0x20; + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.3") << utf8 << str << -1; + + // 3.2.4 + utf8.clear(); + str.clear(); + for (uint i = 0xf8; i <= 0xfb; ++i) { + utf8 += i; + utf8 += 0x20; + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.4") << utf8 << str << -1; + + // 3.2.5 + utf8.clear(); + str.clear(); + for (uint i = 0xfc; i <= 0xfd; ++i) { + utf8 += i; + utf8 += 0x20; + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.5") << utf8 << str << -1; + + // 3.3.1 + utf8.clear(); + utf8 += char(0xc0); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.1") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.1-1") << utf8 << str << -1; + + // 3.3.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xe0); + str = fromInvalidUtf8Sequence(utf8); + 
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2-2") << utf8 << str << -1; + utf8 += 0x30; + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2-3") << utf8 << str << -1; + + // 3.3.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf0); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-5") << utf8 << str << -1; + + // 3.3.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + 
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf8); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-6") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-7") << utf8 << str << -1; + + // 3.3.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + utf8 += 
char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-6") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-7") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-8") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-9") << utf8 << str << -1; + + // 3.3.6 + utf8.clear(); + utf8 += char(0xdf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.6") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.6-1") << utf8 << str << -1; + + // 3.3.7 + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xef); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7-3") << utf8 << str << -1; + + // 3.3.8 + utf8.clear(); + utf8 += char(0xf7); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-1") << utf8 << str << -1; + 
+ utf8.clear(); + utf8 += char(0xf7); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf7); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-5") << utf8 << str << -1; + + // 3.3.9 + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfb); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-6") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + 
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-7") << utf8 << str << -1; + + // 3.3.10 + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-2") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-4") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-6") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-7") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-8") << utf8 << str << -1; + utf8 += char(0x30); + str += 0x30; + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-9") << utf8 << str << -1; + + // 3.4 + utf8.clear(); + utf8 += char(0xc0); 
+ utf8 += char(0xe0); + utf8 += char(0x80); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xdf); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xf7); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.4") << utf8 << str << -1; + + // 3.5.1 + utf8.clear(); + utf8 += char(0xfe); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.5.1") << utf8 << str << -1; + + // 3.5.2 + utf8.clear(); + utf8 += char(0xff); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.5.2") << utf8 << str << -1; + + // 3.5.2 + utf8.clear(); + utf8 += char(0xfe); + utf8 += char(0xfe); + utf8 += char(0xff); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.5.2-1") << utf8 << str << -1; + + // 4.1.1 + utf8.clear(); + utf8 += char(0xc0); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.1") << utf8 << str << -1; + + // 4.1.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.2") << utf8 << str << -1; + + // 4.1.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); 
+ QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.3") << utf8 << str << -1; + + // 4.1.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.4") << utf8 << str << -1; + + // 4.1.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.5") << utf8 << str << -1; + + // 4.2.1 + utf8.clear(); + utf8 += char(0xc1); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.1") << utf8 << str << -1; + + // 4.2.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x9f); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.2") << utf8 << str << -1; + + // 4.2.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x8f); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.3") << utf8 << str << -1; + + // 4.2.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x87); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.4") << utf8 << str << -1; + + // 4.2.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x83); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.5") << utf8 << str << -1; + + // 4.3.1 + utf8.clear(); + 
utf8 += char(0xc0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.1") << utf8 << str << -1; + + // 4.3.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.2") << utf8 << str << -1; + + // 4.3.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.3") << utf8 << str << -1; + + // 4.3.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.4") << utf8 << str << -1; + + // 4.3.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.5") << utf8 << str << -1; + + // 5.1.1 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xa0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.1") << utf8 << str << -1; + + // 5.1.2 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xad); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.2") << utf8 << str << -1; + + // 5.1.3 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xae); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.3") << utf8 << str << -1; + + // 5.1.4 + utf8.clear(); + utf8 += 
char(0xed); + utf8 += char(0xaf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.4") << utf8 << str << -1; + + // 5.1.5 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.5") << utf8 << str << -1; + + // 5.1.6 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xbe); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.6") << utf8 << str << -1; + + // 5.1.7 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.7") << utf8 << str << -1; + + // 5.2.1 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xa0); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.1") << utf8 << str << -1; + + // 5.2.2 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xa0); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.2") << utf8 << str << -1; + + // 5.2.3 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xad); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.3") << utf8 << str << -1; + + // 5.2.4 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xad); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = 
fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.4") << utf8 << str << -1; + + // 5.2.5 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xae); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.5") << utf8 << str << -1; + + // 5.2.6 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xae); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.6") << utf8 << str << -1; + + // 5.2.7 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xaf); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.7") << utf8 << str << -1; + + // 5.2.8 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xaf); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1; + + // 5.3.1 - non-character code + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbe); + //str = QChar(QChar::ReplacementCharacter); + str = QChar(0xfffe); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1; + + // 5.3.2 - non-character code + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbf); + //str = QChar(QChar::ReplacementCharacter); + str = QChar(0xffff); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1; +} + +void tst_QStringConverter::utf8Codec() +{ + QFETCH(QByteArray, utf8); + 
QFETCH(QString, res); + QFETCH(int, len); + + QStringDecoder decoder(QStringDecoder::Utf8, QStringDecoder::Stateless); + QString str = decoder(utf8.isNull() ? 0 : utf8.constData(), + len < 0 ? qstrlen(utf8.constData()) : len); + QCOMPARE(str, res); + + str = QString::fromUtf8(utf8.isNull() ? 0 : utf8.constData(), len); + QCOMPARE(str, res); +} + +void tst_QStringConverter::utf8bom_data() +{ + QTest::addColumn("data"); + QTest::addColumn("result"); + + QTest::newRow("nobom") + << QByteArray("\302\240", 2) + << QString::fromLatin1("\240"); + + { + static const ushort data[] = { 0x201d }; + QTest::newRow("nobom 2") + << QByteArray("\342\200\235", 3) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { + static const ushort data[] = { 0xf000 }; + QTest::newRow("bom1") + << QByteArray("\357\200\200", 3) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { + static const ushort data[] = { 0xfec0 }; + QTest::newRow("bom2") + << QByteArray("\357\273\200", 3) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { + QTest::newRow("normal-bom") + << QByteArray("\357\273\277a", 4) + << QString("a"); + } + + { // test the non-SIMD code-path + static const ushort data[] = { 0x61, 0xfeff, 0x62 }; + QTest::newRow("middle-bom (non SIMD)") + << QByteArray("a\357\273\277b") + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { // test the SIMD code-path + static const ushort data[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0xfeff, 0x6d }; + QTest::newRow("middle-bom (SIMD)") + << QByteArray("abcdefghijkl\357\273\277m") + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } +} + +void tst_QStringConverter::utf8bom() +{ + QFETCH(QByteArray, data); + QFETCH(QString, result); + + QStringDecoder decoder(QStringDecoder::Utf8); + + QCOMPARE(decoder(data.constData(), data.length()), result); +} + +void tst_QStringConverter::utf8stateful_data() +{ + QTest::addColumn("buffer1"); + 
QTest::addColumn("buffer2"); + QTest::addColumn("result"); // null QString indicates decoder error + + // valid buffer continuations + QTest::newRow("1of2+valid") << QByteArray("\xc2") << QByteArray("\xa0") << "\xc2\xa0"; + QTest::newRow("1of3+valid") << QByteArray("\xe0") << QByteArray("\xa0\x80") << "\xe0\xa0\x80"; + QTest::newRow("2of3+valid") << QByteArray("\xe0\xa0") << QByteArray("\x80") << "\xe0\xa0\x80"; + QTest::newRow("1of4+valid") << QByteArray("\360") << QByteArray("\220\210\203") << "\360\220\210\203"; + QTest::newRow("2of4+valid") << QByteArray("\360\220") << QByteArray("\210\203") << "\360\220\210\203"; + QTest::newRow("3of4+valid") << QByteArray("\360\220\210") << QByteArray("\203") << "\360\220\210\203"; + QTest::newRow("1ofBom+valid") << QByteArray("\xef") << QByteArray("\xbb\xbf") << ""; + QTest::newRow("2ofBom+valid") << QByteArray("\xef\xbb") << QByteArray("\xbf") << ""; + + // invalid continuation + QTest::newRow("1of2+invalid") << QByteArray("\xc2") << QByteArray("a") << QString(); + QTest::newRow("1of3+invalid") << QByteArray("\xe0") << QByteArray("a") << QString(); + QTest::newRow("2of3+invalid") << QByteArray("\xe0\xa0") << QByteArray("a") << QString(); + QTest::newRow("1of4+invalid") << QByteArray("\360") << QByteArray("a") << QString(); + QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString(); + QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString(); + + // invalid: sequence too short (the empty second buffer causes a state reset) + QTest::newRow("1of2+empty") << QByteArray("\xc2") << QByteArray() << QString(); + QTest::newRow("1of3+empty") << QByteArray("\xe0") << QByteArray() << QString(); + QTest::newRow("2of3+empty") << QByteArray("\xe0\xa0") << QByteArray() << QString(); + QTest::newRow("1of4+empty") << QByteArray("\360") << QByteArray() << QString(); + QTest::newRow("2of4+empty") << QByteArray("\360\220") << QByteArray() << QString(); + 
QTest::newRow("3of4+empty") << QByteArray("\360\220\210") << QByteArray() << QString(); + + // overlong sequence: + QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString(); + QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString(); + QTest::newRow("overlong-2of3") << QByteArray("\xe0\x81") << QByteArray("\x81") << QString(); + QTest::newRow("overlong-1of4") << QByteArray("\xf0") << QByteArray("\x80\x81\x81") << QString(); + QTest::newRow("overlong-2of4") << QByteArray("\xf0\x80") << QByteArray("\x81\x81") << QString(); + QTest::newRow("overlong-3of4") << QByteArray("\xf0\x80\x81") << QByteArray("\x81") << QString(); + + // out of range: + // leading byte 0xF4 can produce codepoints above U+10FFFF, which aren't valid + QTest::newRow("outofrange1-1of4") << QByteArray("\xf4") << QByteArray("\x90\x80\x80") << QString(); + QTest::newRow("outofrange1-2of4") << QByteArray("\xf4\x90") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange1-3of4") << QByteArray("\xf4\x90\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange2-1of4") << QByteArray("\xf5") << QByteArray("\x90\x80\x80") << QString(); + QTest::newRow("outofrange2-2of4") << QByteArray("\xf5\x90") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange2-3of4") << QByteArray("\xf5\x90\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange-1of5") << QByteArray("\xf8") << QByteArray("\x88\x80\x80\x80") << QString(); + QTest::newRow("outofrange-2of5") << QByteArray("\xf8\x88") << QByteArray("\x80\x80\x80") << QString(); + QTest::newRow("outofrange-3of5") << QByteArray("\xf8\x88\x80") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange-4of5") << QByteArray("\xf8\x88\x80\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange-1of6") << QByteArray("\xfc") << QByteArray("\x84\x80\x80\x80\x80") << QString(); + QTest::newRow("outofrange-2of6") << QByteArray("\xfc\x84") << 
QByteArray("\x80\x80\x80\x80") << QString(); + QTest::newRow("outofrange-3of6") << QByteArray("\xfc\x84\x80") << QByteArray("\x80\x80\x80") << QString(); + QTest::newRow("outofrange-4of6") << QByteArray("\xfc\x84\x80\x80") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange-5of6") << QByteArray("\xfc\x84\x80\x80\x80") << QByteArray("\x80") << QString(); +} + +void tst_QStringConverter::utf8stateful() +{ + QFETCH(QByteArray, buffer1); + QFETCH(QByteArray, buffer2); + QFETCH(QString, result); + + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + + QString decoded = decoder(buffer1); + if (result.isNull()) { + if (!decoder.hasError()) { + // incomplete data + decoded += decoder(buffer2); + QVERIFY(decoder.hasError()); + } + } else { + QVERIFY(!decoder.hasError()); + decoded += decoder(buffer2); + QVERIFY(!decoder.hasError()); + QCOMPARE(decoded, result); + } +} + +void tst_QStringConverter::utfHeaders_data() +{ + QTest::addColumn("encoding"); + QTest::addColumn("flags"); + QTest::addColumn("encoded"); + QTest::addColumn("unicode"); + + QTest::newRow("utf8 bom") + << QStringConverter::Utf8 + << QStringConverter::DefaultConversion + << QByteArray("\xef\xbb\xbfhello") + << QString::fromLatin1("hello"); + QTest::newRow("utf8 nobom") + << QStringConverter::Utf8 + << QStringConverter::DefaultConversion + << QByteArray("hello") + << QString::fromLatin1("hello"); + QTest::newRow("utf8 bom ignore header") + << QStringConverter::Utf8 + << QStringConverter::IgnoreHeader + << QByteArray("\xef\xbb\xbfhello") + << (QString(QChar(0xfeff)) + QString::fromLatin1("hello")); + QTest::newRow("utf8 nobom ignore header") + << QStringConverter::Utf8 + << QStringConverter::IgnoreHeader + << QByteArray("hello") + << QString::fromLatin1("hello"); + + QTest::newRow("utf16 bom be") + << QStringConverter::Utf16 + << QStringConverter::DefaultConversion + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << QString::fromLatin1("hel"); + QTest::newRow("utf16 bom 
le") + << QStringConverter::Utf16 + << QStringConverter::DefaultConversion + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << QString::fromLatin1("hel"); + if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { + QTest::newRow("utf16 nobom") + << QStringConverter::Utf16 + << QStringConverter::DefaultConversion + << QByteArray("\0h\0e\0l", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16 bom be ignore header") + << QStringConverter::Utf16 + << QStringConverter::IgnoreHeader + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } else { + QTest::newRow("utf16 nobom") + << QStringConverter::Utf16 + << QStringConverter::DefaultConversion + << QByteArray("h\0e\0l\0", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16 bom le ignore header") + << QStringConverter::Utf16 + << QStringConverter::IgnoreHeader + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } + + QTest::newRow("utf16-be bom be") + << QStringConverter::Utf16BE + << QStringConverter::DefaultConversion + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-be nobom") + << QStringConverter::Utf16BE + << QStringConverter::DefaultConversion + << QByteArray("\0h\0e\0l", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-be bom be ignore header") + << QStringConverter::Utf16BE + << QStringConverter::IgnoreHeader + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + + QTest::newRow("utf16-le bom le") + << QStringConverter::Utf16LE + << QStringConverter::DefaultConversion + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-le nobom") + << QStringConverter::Utf16LE + << QStringConverter::DefaultConversion + << QByteArray("h\0e\0l\0", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-le bom le ignore header") + << QStringConverter::Utf16LE + << 
QStringConverter::IgnoreHeader + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + + QTest::newRow("utf32 bom be") + << QStringConverter::Utf32 + << QStringConverter::DefaultConversion + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << QString::fromLatin1("hel"); + QTest::newRow("utf32 bom le") + << QStringConverter::Utf32 + << QStringConverter::DefaultConversion + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << QString::fromLatin1("hel"); + if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { + QTest::newRow("utf32 nobom") + << QStringConverter::Utf32 + << QStringConverter::DefaultConversion + << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32 bom be ignore header") + << QStringConverter::Utf32 + << QStringConverter::IgnoreHeader + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } else { + QTest::newRow("utf32 nobom") + << QStringConverter::Utf32 + << QStringConverter::DefaultConversion + << QByteArray("h\0\0\0e\0\0\0l\0\0\0", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32 bom le ignore header") + << QStringConverter::Utf32 + << QStringConverter::IgnoreHeader + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } + + QTest::newRow("utf32-be bom be") + << QStringConverter::Utf32BE + << QStringConverter::DefaultConversion + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-be nobom") + << QStringConverter::Utf32BE + << QStringConverter::DefaultConversion + << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-be bom be ignore header") + << QStringConverter::Utf32BE + << QStringConverter::IgnoreHeader + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << (QString(QChar(0xfeff)) + 
QString::fromLatin1("hel")); + + QTest::newRow("utf32-le bom le") + << QStringConverter::Utf32LE + << QStringConverter::DefaultConversion + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-le nobom") + << QStringConverter::Utf32LE + << QStringConverter::DefaultConversion + << QByteArray("h\0\0\0e\0\0\0l\0\0\0", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-le bom le ignore header") + << QStringConverter::Utf32LE + << QStringConverter::IgnoreHeader + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); +} + +void tst_QStringConverter::utfHeaders() +{ + QFETCH(QStringConverter::Encoding, encoding); + QFETCH(QStringConverter::Flag, flags); + QFETCH(QByteArray, encoded); + QFETCH(QString, unicode); + + QLatin1String ignoreReverseTestOn = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? QLatin1String(" le") : QLatin1String(" be"); + QString rowName(QTest::currentDataTag()); + + QStringDecoder decode(encoding, flags); + QVERIFY(decode.isValid()); + + QString result = decode(encoded); + QCOMPARE(result.length(), unicode.length()); + QCOMPARE(result, unicode); + + if (!rowName.endsWith("nobom") && !rowName.contains(ignoreReverseTestOn)) { + QStringEncoder encode(encoding, flags); + QVERIFY(encode.isValid()); + QByteArray reencoded = encode(unicode); + QCOMPARE(reencoded, encoded); + } +} + +class LoadAndConvert: public QRunnable +{ +public: + LoadAndConvert(QStringConverter::Encoding encoding, QString *destination) + : encode(encoding), decode(encoding), target(destination) + {} + QStringEncoder encode; + QStringDecoder decode; + QString *target; + void run() + { + QString str = QString::fromLatin1("abcdefghijklmonpqrstufvxyz"); + for (int i = 0; i < 10000; ++i) { + QByteArray b = encode(str); + *target = decode(b); + QCOMPARE(*target, str); + } + } +}; + +void tst_QStringConverter::threadSafety() +{ + 
QThreadPool::globalInstance()->setMaxThreadCount(12); + + QVector res; + res.resize(QStringConverter::LastEncoding + 1); + for (int i = 0; i < QStringConverter::LastEncoding + 1; ++i) { + QThreadPool::globalInstance()->start(new LoadAndConvert(QStringConverter::Encoding(i), &res[i])); + } + + // wait for all threads to finish working + QThreadPool::globalInstance()->waitForDone(); + + for (auto b : res) + QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz")); +} + +struct DontCrashAtExit { + ~DontCrashAtExit() { + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + (void)decoder("azerty"); + } +} dontCrashAtExit; + + +QTEST_MAIN(tst_QStringConverter) +#include "tst_qstringconverter.moc" diff --git a/tests/auto/corelib/text/qstringconverter/utf8.txt b/tests/auto/corelib/text/qstringconverter/utf8.txt new file mode 100644 index 0000000000..f5ab44c8f4 --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/utf8.txt @@ -0,0 +1 @@ +𐀀􏿽 diff --git a/tests/auto/corelib/text/text.pro b/tests/auto/corelib/text/text.pro index 09d58bd74c..cb7de443bd 100644 --- a/tests/auto/corelib/text/text.pro +++ b/tests/auto/corelib/text/text.pro @@ -15,6 +15,7 @@ SUBDIRS = \ qstring_no_cast_from_bytearray \ qstringapisymmetry \ qstringbuilder \ + qstringconverter \ qstringiterator \ qstringlist \ qstringmatcher \ -- cgit v1.2.3