diff options
Diffstat (limited to 'tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp')
-rw-r--r-- | tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp | 2839 |
1 files changed, 2839 insertions, 0 deletions
diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp new file mode 100644 index 0000000000..ed3f91ac94 --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -0,0 +1,2839 @@ +// Copyright (C) 2021 The Qt Company Ltd. +// Copyright (C) 2016 Intel Corporation. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only + +#include <QTest> + +#include <QtCore/private/qglobal_p.h> +#include <qstringconverter.h> +#include <private/qstringconverter_p.h> +#include <qthreadpool.h> + +#include <array> +#include <numeric> + +using namespace Qt::StringLiterals; + +QT_BEGIN_NAMESPACE +namespace QTest { +template <typename T> +char *toString(const std::optional<T> &opt) +{ + if (opt) + return QTest::toString(*opt); + else + return qstrdup("std::nullopt"); +} +} // namespace QTest +QT_END_NAMESPACE + +using QTest::toString; + +static constexpr bool IsBigEndian = QSysInfo::ByteOrder == QSysInfo::BigEndian; +enum CodecLimitation { + AsciiOnly, + Latin1Only, + FullUnicode +}; + +#ifdef Q_OS_WIN +# include <qt_windows.h> +static bool localeIsUtf8() +{ + return GetACP() == CP_UTF8; +} +#else +static constexpr bool localeIsUtf8() +{ + return true; +} +#endif + +struct Codec +{ + const char name[12]; + QStringConverter::Encoding code; + CodecLimitation limitation = FullUnicode; +}; +static const std::array codes = { + Codec{ "UTF-8", QStringConverter::Utf8 }, + Codec{ "UTF-16", QStringConverter::Utf16 }, + Codec{ "UTF-16-le", QStringConverter::Utf16LE }, + Codec{ "UTF-16-be", QStringConverter::Utf16BE }, + Codec{ "UTF-32", QStringConverter::Utf32 }, + Codec{ "UTF-32-le", QStringConverter::Utf32LE }, + Codec{ "UTF-32-be", QStringConverter::Utf32BE }, + Codec{ "Latin-1", QStringConverter::Latin1, Latin1Only }, + Codec{ "System", QStringConverter::System, localeIsUtf8() ? FullUnicode : AsciiOnly } +}; + +static const std::array encodedBoms = { + QByteArrayView("\xef\xbb\xbf"), // Utf8, + QByteArrayView(IsBigEndian ? "\xfe\xff" : "\xff\xfe"), // Utf16, + QByteArrayView("\xff\xfe"), // Utf16LE, + QByteArrayView("\xfe\xff"), // Utf16BE, + QByteArrayView(IsBigEndian ? "\0\0\xfe\xff" : "\xff\xfe\0", 4), // Utf32, + QByteArrayView("\xff\xfe\0", 4), // Utf32LE, + QByteArrayView("\0\0\xfe\xff", 4), // Utf32BE, +}; + +struct TestString +{ + const char *description; + QUtf8StringView utf8; + QStringView utf16; + CodecLimitation limitation = FullUnicode; +}; +static const std::array testStrings = { + TestString{ "empty", "", u"", AsciiOnly }, + TestString{ "null-character", QUtf8StringView("", 1), QStringView(u"", 1), AsciiOnly }, + TestString{ "ascii-text", + "This is a standard US-ASCII message", + "This is a standard US-ASCII message" u"", + AsciiOnly + }, + TestString{ "ascii-with-carriage-return", "a\rb", u"a\rb", AsciiOnly }, + TestString{ "ascii-with-control", + "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars", + "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars" u"", + AsciiOnly + }, + + TestString{ "nbsp", "\u00a0", u"\u00a0", Latin1Only }, + TestString{ "latin1-text", + "Hyvää päivää, käyhän että tuon kannettavani saunaan?", + "Hyvää päivää, käyhän että tuon kannettavani saunaan?" u"", + Latin1Only + }, + +#define ROW(name, string) TestString{ name, u8"" string, u"" string } + ROW("euro", "€"), + ROW("character+bom", "b\ufeff"), + /* Check that the codec does NOT flag EFBFBF. + * This is a regression test; see QTBUG-33229 + */ + ROW("last-bmp", "\uffff"), + ROW("character+last-bmp", "b\uffff"), + ROW("replacement", "\ufffd"), + ROW("supplementary-plane", "\U00010203"), + ROW("mahjong", "\U0001f000\U0001f001\U0001f002\U0001f003\U0001f004\U0001f005" + "\U0001f006\U0001f007\U0001f008\U0001f009\U0001f00a\U0001f00b\U0001f00c" + "\U0001f00d\U0001f00e\U0001f00f"), + ROW("emojis", "😂, 😃, 🧘🏻♂️, 🌍, 🌦️, 🍞, 🚗, 📞, 🎉, ❤️, 🏁"), // https://en.wikipedia.org/wiki/Emoji + ROW("last-valid", "\U0010fffd"), // U+10FFFF is the strict last, but it's a non-character + ROW("mixed-bmp-only", "abc\u00a0\u00e1\u00e9\u01fd \u20acdef"), + ROW("mixed-full", "abc\u00a0\u00e1\u00e9\u01fd \U0010FFFD \u20acdef"), + ROW("xml", "<doc>\U00010000\U0010FFFD</doc>\r\n") +#undef ROW +}; + +class tst_QStringConverter : public QObject +{ + Q_OBJECT + +private slots: + void initTestCase(); + + void threadSafety(); + + void constructByName(); + + void invalidConverter(); + + void convertUtf8_data(); + void convertUtf8(); + void convertUtf8CharByChar_data() { convertUtf8_data(); } + void convertUtf8CharByChar(); + void roundtrip_data(); + void roundtrip(); + + void convertL1U8(); + + void convertL1U16(); + +#if QT_CONFIG(icu) + void roundtripIcu_data(); + void roundtripIcu(); + void icuInvalidCharacter_data(); + void icuInvalidCharacter(); + void icuEncodeEdgeCases_data(); + void icuEncodeEdgeCases(); + void icuUsableAfterMove(); + void charByCharConsistency_data(); + void charByCharConsistency(); + void byteByByteConsistency_data(); + void byteByByteConsistency(); + void statefulPieceWise(); +#endif + + void flagF7808080() const; + + void utf8Codec_data(); + void utf8Codec(); + + void utf8bom_data(); + void utf8bom(); + void roundtripBom_data(); + void roundtripBom(); + + void utf8stateful_data(); + void utf8stateful(); + + void utfHeaders_data(); + void utfHeaders(); + + void encodingForName_data(); + void encodingForName(); + + void nameForEncoding_data(); + void nameForEncoding(); + + void encodingForData_data(); + void encodingForData(); + + void encodingForHtml_data(); + void encodingForHtml(); + + void availableCodesAreAvailable(); + +#ifdef Q_OS_WIN + // On all other systems local 8-bit encoding is UTF-8 + void fromLocal8Bit_data(); + void fromLocal8Bit(); + void fromLocal8Bit_special_cases(); + void fromLocal8Bit_2GiB(); + void toLocal8Bit_data(); + void toLocal8Bit(); + void toLocal8Bit_special_cases(); + void toLocal8Bit_2GiB(); +#endif +}; + +void tst_QStringConverter::constructByName() +{ + QStringDecoder decoder("UTF-8"); + QVERIFY(decoder.isValid()); + QVERIFY(!strcmp(decoder.name(), "UTF-8")); + decoder = QStringDecoder("XXX"); + QVERIFY(!decoder.isValid()); + decoder = QStringDecoder("ISO-8859-1"); + QVERIFY(decoder.isValid()); + QVERIFY(!strcmp(decoder.name(), "ISO-8859-1")); + decoder = QStringDecoder("UTF-16LE"); + QVERIFY(decoder.isValid()); + QVERIFY(!strcmp(decoder.name(), "UTF-16LE")); + + decoder = QStringDecoder("utf8"); + QVERIFY(decoder.isValid()); + QVERIFY(!strcmp(decoder.name(), "UTF-8")); + decoder = QStringDecoder("iso8859-1"); + QVERIFY(decoder.isValid()); + QVERIFY(!strcmp(decoder.name(), "ISO-8859-1")); + decoder = QStringDecoder("utf-16"); + QVERIFY(decoder.isValid()); + QVERIFY(!strcmp(decoder.name(), "UTF-16")); +} + +void tst_QStringConverter::invalidConverter() +{ + // QStringEncoder tests + { + QStringEncoder encoder; + QVERIFY(!encoder.isValid()); + QVERIFY(!encoder.name()); + QByteArray encoded = encoder(u"Some text"); + QVERIFY(encoded.isEmpty()); + QVERIFY(encoder.hasError()); + + encoder.resetState(); + QVERIFY(!encoder.hasError()); + + encoded = encoder.encode(u"More text"); + QVERIFY(encoded.isEmpty()); + QVERIFY(encoder.hasError()); + QCOMPARE(encoder.requiredSpace(42), 0); + + encoder.resetState(); + QVERIFY(!encoder.hasError()); + char buffer[100]; + char *position = encoder.appendToBuffer(buffer, u"Even more"); + QCOMPARE(position, buffer); + QVERIFY(encoder.hasError()); + } + + // QStringDecoder tests + { + QStringDecoder decoder; + QVERIFY(!decoder.name()); + QVERIFY(!decoder.isValid()); + QString decoded = decoder("Some text"); + QVERIFY(decoded.isEmpty()); + QVERIFY(decoder.hasError()); + + decoder.resetState(); + QVERIFY(!decoder.hasError()); + + decoded = decoder.decode("More text"); + QVERIFY(decoded.isEmpty()); + QVERIFY(decoder.hasError()); + + QCOMPARE(decoder.requiredSpace(42), 0); + + decoder.resetState(); + QVERIFY(!decoder.hasError()); + char16_t buffer[100]; + char16_t *position = decoder.appendToBuffer(buffer, "Even more"); + QCOMPARE(position, buffer); + QVERIFY(decoder.hasError()); + } +} + +void tst_QStringConverter::convertUtf8_data() +{ + QTest::addColumn<QStringConverter::Encoding>("encoding"); + QTest::addColumn<QUtf8StringView>("utf8"); + QTest::addColumn<QStringView>("utf16"); + auto addRow = [](const TestString &s) { + QTest::addRow("Utf8:%s", s.description) << QStringDecoder::Utf8 << s.utf8 << s.utf16; + if (localeIsUtf8()) + QTest::addRow("System:%s", s.description) << QStringDecoder::System << s.utf8 << s.utf16; + }; + + for (const TestString &s : testStrings) + addRow(s); +} + +void tst_QStringConverter::convertUtf8() +{ + QFETCH(QStringConverter::Encoding, encoding); + QFETCH(QUtf8StringView, utf8); + QFETCH(QStringView, utf16); + + QByteArray ba = QByteArray::fromRawData(utf8.data(), utf8.size()); + + QStringDecoder decoder(encoding); + QVERIFY(decoder.isValid()); + QString uniString = decoder(ba); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + QCOMPARE(ba, uniString.toUtf8()); + + // do it again (using .decode()) + uniString = decoder.decode(ba); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + QCOMPARE(ba, uniString.toUtf8()); + + QStringEncoder encoder(encoding); + QByteArray reencoded = encoder(utf16); + QCOMPARE(reencoded, utf8); + QCOMPARE(reencoded, uniString.toUtf8()); + + // do it again (using .encode()) + reencoded = encoder.encode(utf16); + QCOMPARE(reencoded, utf8); + QCOMPARE(reencoded, uniString.toUtf8()); + + if (utf16.isEmpty()) + return; + + // repeat, with a longer string + constexpr qsizetype MinSize = 128; + uniString = utf16.toString(); + while (uniString.size() < MinSize && ba.size() < MinSize) { + uniString += uniString; + ba += ba; + } + QCOMPARE(decoder(ba), uniString); + QCOMPARE(encoder(uniString), ba); +} + +void tst_QStringConverter::convertUtf8CharByChar() +{ + QFETCH(QStringConverter::Encoding, encoding); + QFETCH(QUtf8StringView, utf8); + QFETCH(QStringView, utf16); + + QByteArray ba = QByteArray::fromRawData(utf8.data(), utf8.size()); + + QStringDecoder decoder(encoding); + QVERIFY(decoder.isValid()); + QString uniString; + for (int i = 0; i < ba.size(); ++i) + uniString += decoder(QByteArrayView(ba).sliced(i, 1)); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + uniString.clear(); + + // do it again (using .decode()) + for (int i = 0; i < ba.size(); ++i) + uniString += decoder.decode(QByteArrayView(ba).sliced(i, 1)); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + + QStringEncoder encoder(encoding); + QByteArray reencoded; + for (int i = 0; i < utf16.size(); ++i) + reencoded += encoder(utf16.sliced(i, 1)); + QCOMPARE(reencoded, ba); + reencoded.clear(); + + // do it again (using .encode()) + for (int i = 0; i < utf16.size(); ++i) + reencoded += encoder.encode(utf16.sliced(i, 1)); + QCOMPARE(reencoded, ba); +} + +void tst_QStringConverter::convertL1U16() +{ + const QLatin1StringView latin1("some plain latin1 text"); + const QString qstr(latin1); + + QStringDecoder decoder(QStringConverter::Latin1); + QVERIFY(decoder.isValid()); + QString uniString = decoder(latin1); + QCOMPARE(uniString, qstr); + QCOMPARE(latin1, uniString.toLatin1()); + + // do it again (using .decode()) + uniString = decoder.decode(latin1); + QCOMPARE(uniString, qstr); + QCOMPARE(latin1, uniString.toLatin1()); + + QStringEncoder encoder(QStringConverter::Latin1); + QByteArray reencoded = encoder(uniString); + QCOMPARE(reencoded, QByteArrayView(latin1)); + QCOMPARE(reencoded, uniString.toLatin1()); + + // do it again (using .encode()) + reencoded = encoder.encode(uniString); + QCOMPARE(reencoded, QByteArrayView(latin1)); + QCOMPARE(reencoded, uniString.toLatin1()); +} + +void tst_QStringConverter::roundtrip_data() +{ + QTest::addColumn<QStringView>("utf16"); + QTest::addColumn<QStringConverter::Encoding>("code"); + + for (const auto &code : codes) { + for (const TestString &s : testStrings) { + // rules: + // 1) don't pass the null character to the System codec + // 2) only pass operate on a string that will properly convert + if (code.code == QStringConverter::System && s.utf16.contains(QChar(0))) + continue; + if (code.limitation < s.limitation) + continue; + QTest::addRow("%s:%s", code.name, s.description) << s.utf16 << code.code; + } + + if (code.limitation == FullUnicode) { + using Digits = std::array<QChar, 2>; + using DigitsArray = std::array<Digits, 10>; + static constexpr DigitsArray chakmaDigits = []() { + const char32_t zeroVal = 0x11136; // Unicode's representation of Chakma zero + DigitsArray r; + for (int i = 0; i < int(r.size()); ++i) + r[i] = { QChar::highSurrogate(zeroVal + i), QChar::lowSurrogate(zeroVal + i) }; + return r; + }(); + for (int i = 0; i < int(chakmaDigits.size()); ++i) + QTest::addRow("%s:Chakma-digit-%d", code.name, i) << QStringView(chakmaDigits[i]) << code.code; + } + } +} + +void tst_QStringConverter::roundtrip() +{ + QFETCH(QStringView, utf16); + QFETCH(QStringConverter::Encoding, code); + QStringEncoder out(code); + QByteArray encoded = out.encode(utf16); + QStringDecoder back(code); + QString decoded = back.decode(encoded); + QCOMPARE(decoded, utf16); + + // test some flags + QStringConverter::Flags flag = QStringEncoder::Flag::Stateless; + { + QStringEncoder out2(code, flag); + QStringDecoder back2(code, flag); + decoded = back2.decode(out2.encode(utf16)); + QCOMPARE(decoded, utf16); + } + flag |= QStringConverter::Flag::ConvertInvalidToNull; + { + QStringEncoder out2(code, flag); + QStringDecoder back2(code, flag); + decoded = back2.decode(out2.encode(utf16)); + QCOMPARE(decoded, utf16); + } + + if (utf16.isEmpty()) + return; + + // repeat, with a longer string + constexpr qsizetype MinSize = 128; + QString uniString = utf16.toString(); + while (uniString.size() < MinSize && encoded.size() < MinSize) { + uniString += uniString; + encoded += encoded; + } + QCOMPARE(out.encode(uniString), encoded); + QCOMPARE(back.decode(encoded), uniString); + + QStringEncoder out2(code, flag); + QStringDecoder back2(code, flag); + decoded = back2.decode(out2.encode(uniString)); + QCOMPARE(decoded, uniString); +} + +void tst_QStringConverter::convertL1U8() +{ + { + std::array<char, 256> latin1; + std::iota(latin1.data(), latin1.data() + latin1.size(), uchar(0)); + std::array<char, 512> utf8; + auto out = QUtf8::convertFromLatin1(utf8.data(), QLatin1StringView{latin1.data(), latin1.size()}); + QCOMPARE(QString::fromLatin1(latin1.data(), latin1.size()), + QString::fromUtf8(utf8.data(), out - utf8.data())); + } +} + +#if QT_CONFIG(icu) + +void tst_QStringConverter::roundtripIcu_data() +{ + QTest::addColumn<QString>("original"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("shift_jis") << u"古池や 蛙飛び込む 水の音"_s << QByteArray("shift_jis"); + QTest::addRow("UTF7") << u"Übermäßig: čçö"_s << QByteArray("UTF-7"); +} + +void tst_QStringConverter::roundtripIcu() +{ + QFETCH(QString, original); + QFETCH(QByteArray, codec); + QStringEncoder fromUtf16(codec); + if (!fromUtf16.isValid()) + QSKIP("Unsupported codec"); + QStringDecoder toUtf16(codec); + QByteArray asShiftJIS = fromUtf16(original); + QString roundTripped = toUtf16(asShiftJIS); + QCOMPARE(roundTripped, original); +} + +void tst_QStringConverter::icuEncodeEdgeCases_data() +{ + QTest::addColumn<QString>("source"); + QTest::addColumn<QByteArray>("expected") ; + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("empty") << QString() << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("BOMonly") << QString(QChar(QChar::ByteOrderMark)) << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("1to6") << u"좋"_s << QByteArray::fromHex("1b2428434141") << QByteArray("ISO-2022-JP-2"); + QTest::addRow("1to7") << u"漢"_s << QByteArray::fromHex("1b2429470e6947") << QByteArray("ISO-2022-CN"); + QTest::addRow("1to8") << u"墎"_s << QByteArray::fromHex("1b242a481b4e4949") << QByteArray("ISO-2022-CN"); + QTest::addRow("utf7") << u"Übergröße"_s << QByteArray("+ANw-bergr+APYA3w-e") << QByteArray("UTF-7"); +} + +void tst_QStringConverter::icuEncodeEdgeCases() +{ + QFETCH(QString, source); + QFETCH(QByteArray, expected); + QFETCH(QByteArray, codec); + QStringEncoder encoder(codec); + if (!encoder.isValid()) + QSKIP("Unsupported codec"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(source); + QCOMPARE(encoded, expected); +} + +void tst_QStringConverter::charByCharConsistency_data() +{ + QTest::addColumn<QStringView>("source"); + QTest::addColumn<QByteArray>("codec"); + + auto addRow = [](const TestString &s) { + QTest::addRow("%s_shift_jis", s.description) << s.utf16 << QByteArray("shift_jis"); + QTest::addRow("%s_EUC-CN", s.description) << s.utf16 << QByteArray("EUC-CN"); + }; + + for (const TestString &s : testStrings) { + if (s.utf16.isEmpty()) + continue; + addRow(s); + } +} + +void tst_QStringConverter::charByCharConsistency() +{ + QFETCH(const QStringView, source); + QFETCH(const QByteArray, codec); + + const auto check = [&](QStringEncoder encoder){ + if (!encoder.isValid()) + QSKIP("Unsupported codec"); + + QByteArray fullyConverted = encoder.encode(source); + encoder.resetState(); + QByteArray stepByStepConverted; + for (const auto& codeUnit: source) { + stepByStepConverted += encoder.encode(codeUnit); + } + QCOMPARE(stepByStepConverted, fullyConverted); + }; + + check(QStringEncoder(codec)); + if (QTest::currentTestResolved()) return; + + check(QStringEncoder(codec, QStringConverter::Flag::ConvertInvalidToNull)); + if (QTest::currentTestResolved()) return; + + // moved codecs also work: + + { + QStringEncoder dec(codec); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + + { + QStringEncoder dec(codec, QStringConverter::Flag::ConvertInvalidToNull); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + +} + +void tst_QStringConverter::byteByByteConsistency_data() +{ + QTest::addColumn<QByteArray>("source"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("plain_ascii_utf7") << QByteArray("Hello, world!") << QByteArray("UTF-7"); + QFile eucKr(":/euc_kr.txt"); + if (eucKr.open(QFile::ReadOnly)) + QTest::addRow("euc_kr_storing_jp") << eucKr.readAll() << QByteArray("EUC-KR"); + QTest::addRow("incomplete_euc_jp") << QByteArrayLiteral("test\x8Ftest") << QByteArray("EUC-JP"); +} + +void tst_QStringConverter::byteByByteConsistency() +{ + QFETCH(const QByteArray, source); + QFETCH(const QByteArray, codec); + + const auto check = [&](QStringDecoder decoder) { + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + + QString fullyConverted = decoder.decode(source); + decoder.resetState(); + QString stepByStepConverted; + for (const auto& byte: source) { + QByteArray singleChar; + singleChar.append(byte); + stepByStepConverted += decoder.decode(singleChar); + } + QCOMPARE(stepByStepConverted, fullyConverted); + }; + + check(QStringDecoder(codec)); + if (QTest::currentTestResolved()) return; + + check(QStringDecoder(codec, QStringConverter::Flag::ConvertInvalidToNull)); + if (QTest::currentTestResolved()) return; + + // moved codecs also work: + + { + QStringDecoder dec(codec); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + + { + QStringDecoder dec(codec, QStringConverter::Flag::ConvertInvalidToNull); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + +} + +void tst_QStringConverter::statefulPieceWise() +{ + QStringDecoder decoder("HZ"); + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + QString start = decoder.decode("pure ASCII"); + QCOMPARE(start, u"pure ASCII"); + QString shifted = decoder.decode("~{"); + // shift out changes the state, but won't create any output + QCOMPARE(shifted, ""); + QString continuation = decoder.decode("\x42\x43"); + QCOMPARE(continuation, "旅"); + decoder.resetState(); + // after resetting the state we're in N0 again + QString afterReset = decoder.decode("\x42\x43"); + QCOMPARE(afterReset, "BC"); +} + +void tst_QStringConverter::icuUsableAfterMove() +{ + { + QStringDecoder decoder("EUC-JP"); + QVERIFY(decoder.isValid()); + QString partial = decoder.decode("Test\x8E"); + QCOMPARE(partial, u"Test"_s); + QStringDecoder moved(std::move(decoder)); + QString complete = partial + moved.decode("\xA1Test"); + QCOMPARE(complete, u"Test\uFF61Test"_s); + } + { + QStringEncoder encoder("Big5"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode("hello"_L1); + QCOMPARE(encoded, "hello"); + QStringEncoder moved(std::move(encoder)); + encoded = moved.encode("bye"); + QCOMPARE(encoded, "bye"); + } +} + +void tst_QStringConverter::icuInvalidCharacter_data() +{ + QTest::addColumn<QString>("string"); + QTest::addColumn<QByteArray>("bytearray"); + QTest::addColumn<QByteArray>("codec"); + QTest::addColumn<QStringConverter::Flags>("flags"); + QTest::addColumn<bool>("shouldDecode"); + + using Flags = QStringConverter::Flags; + using Flag = QStringConverter::Flag; + QTest::addRow("encode") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\x3F\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::Default) + << false; + QTest::addRow("encode_null") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\0\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::ConvertInvalidToNull) + << false; + QTest::addRow("decode_incomplete_EUC-JP") + << u"test"_s + << QByteArrayLiteral("test\x8F") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << true; + QTest::addRow("decode_invalid_EUC-JP_sequence") + << u"test\0test"_s + << QByteArrayLiteral("test\x8Ftest") + << QByteArray("EUC-JP") << Flags(Flag::ConvertInvalidToNull) + << true; + QTest::addRow("encode_incomplete_surrogate") + << u"test"_s + QChar::highSurrogate(0x11136) + << QByteArray("test") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << false; +} + +void tst_QStringConverter::icuInvalidCharacter() +{ + QFETCH(QString, string); + QFETCH(QByteArray, bytearray); + QFETCH(QByteArray, codec); + QFETCH(QStringConverter::Flags, flags); + QFETCH(bool, shouldDecode); + if (shouldDecode) { + QStringDecoder decoder(codec.data(), flags); + QVERIFY(decoder.isValid()); + QString decoded = decoder.decode(bytearray); + QVERIFY(decoder.hasError()); + QCOMPARE(decoded, string); + } else { + QStringEncoder encoder(codec.data(), flags); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(string); + QVERIFY(encoder.hasError()); + QCOMPARE(encoded, bytearray); + } +} + +#endif + +void tst_QStringConverter::flagF7808080() const +{ + /* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml, + * whose description reads: + * + * "Four byte UTF-8 encodings can encode UCS-4 characters + * which are beyond the range of legal XML characters + * (and can't be expressed in Unicode surrogate pairs). + * This document holds such a character." + * + * In binary, this is: + * 11110111100000001000000010000000 + * * * * * + * 11110www10xxxxxx10yyyyyy10zzzzzz + * + * With multibyte logic removed it is the codepoint 0x1C0000. + */ + QByteArray input; + input.resize(4); + input[0] = char(0xF7); + input[1] = char(0x80); + input[2] = char(0x80); + input[3] = char(0x80); + + QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull); + QVERIFY(decoder.isValid()); + + QCOMPARE(decoder(input), QString(input.size(), QChar(0))); +} + +static QString fromInvalidUtf8Sequence(const QByteArray &ba) +{ + return QString().fill(QChar::ReplacementCharacter, ba.size()); +} + +// copied from tst_QString::fromUtf8_data() +void tst_QStringConverter::utf8Codec_data() +{ + QTest::addColumn<QByteArray>("utf8"); + QTest::addColumn<QString>("res"); + QTest::addColumn<int>("len"); + QString str; + + QTest::newRow("str0") << QByteArray("abcdefgh") << QString("abcdefgh") << -1; + QTest::newRow("str0-len") << QByteArray("abcdefgh") << QString("abc") << 3; + QTest::newRow("str1") << QByteArray("\303\266\303\244\303\274\303\226\303\204\303\234\303\270\303\246\303\245\303\230\303\206\303\205") + << QString::fromLatin1("\366\344\374\326\304\334\370\346\345\330\306\305") << -1; + QTest::newRow("str1-len") << QByteArray("\303\266\303\244\303\274\303\226\303\204\303\234\303\270\303\246\303\245\303\230\303\206\303\205") + << QString::fromLatin1("\366\344\374\326\304") << 10; + + str += QChar(0x05e9); + str += QChar(0x05d3); + str += QChar(0x05d2); + QTest::newRow("str2") << QByteArray("\327\251\327\223\327\222") << str << -1; + + str = QChar(0x05e9); + QTest::newRow("str2-len") << QByteArray("\327\251\327\223\327\222") << str << 2; + + str = QChar(0x20ac); + str += " some text"; + QTest::newRow("str3") << QByteArray("\342\202\254 some text") << str << -1; + + str = QChar(0x20ac); + str += " some "; + QTest::newRow("str3-len") << QByteArray("\342\202\254 some text") << str << 9; + + str = "hello"; + str += QChar::ReplacementCharacter; + str += QChar(0x68); + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QChar(0x61); + str += QChar::ReplacementCharacter; + QTest::newRow("invalid utf8") << QByteArray("hello\344h\344\344\366\344a\304") << str << -1; + QTest::newRow("invalid utf8-len") << QByteArray("hello\344h\344\344\366\344a\304") << QString("hello") << 5; + + str = "Prohl"; + str += QChar::ReplacementCharacter; + str += QChar::ReplacementCharacter; + str += QLatin1Char('e'); + str += QChar::ReplacementCharacter; + str += " plugin"; + str += QChar::ReplacementCharacter; + str += " Netscape"; + + QTest::newRow("task28417") << QByteArray("Prohl\355\276e\350 plugin\371 Netscape") << str << -1; + QTest::newRow("task28417-len") << QByteArray("Prohl\355\276e\350 plugin\371 Netscape") << QString("") << 0; + + QTest::newRow("null-1") << QByteArray() << QString() << -1; + QTest::newRow("null0") << QByteArray() << QString() << 0; + // QTest::newRow("null5") << QByteArray() << QString() << 5; + QTest::newRow("empty-1") << QByteArray("\0abcd", 5) << QString() << -1; + QTest::newRow("empty0") << QByteArray() << QString() << 0; + QTest::newRow("empty5") << QByteArray("\0abcd", 5) << QString::fromLatin1("\0abcd", 5) << 5; + QTest::newRow("other-1") << QByteArray("ab\0cd", 5) << QString::fromLatin1("ab") << -1; + QTest::newRow("other5") << QByteArray("ab\0cd", 5) << QString::fromLatin1("ab\0cd", 5) << 5; + + str = "Old Italic: "; + str += QChar(0xd800); + str += QChar(0xdf00); + str += QChar(0xd800); + str += QChar(0xdf01); + str += QChar(0xd800); + str += QChar(0xdf02); + str += QChar(0xd800); + str += QChar(0xdf03); + str += QChar(0xd800); + str += QChar(0xdf04); + QTest::newRow("surrogate") << QByteArray("Old Italic: \360\220\214\200\360\220\214\201\360\220\214\202\360\220\214\203\360\220\214\204") << str << -1; + + QTest::newRow("surrogate-len") << QByteArray("Old Italic: \360\220\214\200\360\220\214\201\360\220\214\202\360\220\214\203\360\220\214\204") << str.left(16) << 20; + + // from http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html + + // 2.1.1 U+00000000 + QByteArray utf8; + utf8 += char(0x00); + str = QChar(QChar::Null); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.1") << utf8 << str << 1; + + // 2.1.2 U+00000080 + utf8.clear(); + utf8 += char(0xc2); + utf8 += char(0x80); + str = QChar(0x80); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.2") << utf8 << str << -1; + + // 2.1.3 U+00000800 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0xa0); + utf8 += char(0x80); + str = QChar(0x800); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.3") << utf8 << str << -1; + + // 2.1.4 U+00010000 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x90); + utf8 += char(0x80); + utf8 += char(0x80); + str.clear(); + str += QChar(0xd800); + str += QChar(0xdc00); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.4") << utf8 << str << -1; + + // 2.1.5 U+00200000 (not a valid Unicode character) + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x88); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.5") << utf8 << str << -1; + + // 2.1.6 U+04000000 (not a valid Unicode character) + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x84); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.1.6") << utf8 << str << -1; + + // 2.2.1 U+0000007F + utf8.clear(); + utf8 += char(0x7f); + str = QChar(0x7f); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.1") << utf8 << str << -1; + + // 2.2.2 U+000007FF + utf8.clear(); + utf8 += char(0xdf); + utf8 += char(0xbf); + str = QChar(0x7ff); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1; + + // 2.2.3 U+000FFFF - non-character code + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = QString::fromUtf8(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1; + + // 2.2.4 U+001FFFFF + utf8.clear(); + utf8 += char(0xf7); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.4") << utf8 << str << -1; + + // 2.2.5 U+03FFFFFF (not a valid Unicode character) + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.5") << utf8 << str << -1; + + // 2.2.6 U+7FFFFFFF + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.6") << utf8 << str << -1; + + // 2.3.1 U+0000D7FF + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0x9f); + utf8 += char(0xbf); + str = QChar(0xd7ff); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.1") << utf8 << str << -1; + + // 2.3.2 U+0000E000 + utf8.clear(); + utf8 += char(0xee); + utf8 += char(0x80); + utf8 += char(0x80); + str = QChar(0xe000); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.2") << utf8 << str << -1; + + // 2.3.3 U+0000FFFD + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbd); + str = QChar(QChar::ReplacementCharacter); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.3") << utf8 << str << -1; + + // 2.3.4 U+0010FFFD + utf8.clear(); + utf8 += char(0xf4); + utf8 += char(0x8f); + utf8 += char(0xbf); + utf8 += char(0xbd); + str.clear(); + str += QChar(0xdbff); + str += QChar(0xdffd); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.4") << utf8 << str << -1; + + // 2.3.5 U+00110000 + utf8.clear(); + utf8 += char(0xf4); + utf8 += char(0x90); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.5") << utf8 << str << -1; + + // 3.1.1 + utf8.clear(); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.1") << utf8 << str << -1; + + // 3.1.2 + utf8.clear(); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.2") << utf8 << str << -1; + + // 3.1.3 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.3") << utf8 << str << -1; + + // 3.1.4 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.4") << utf8 << str << -1; + + // 3.1.5 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.5") << utf8 << str << -1; + + // 3.1.6 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.6") << utf8 << str << -1; + + // 3.1.7 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.7") << utf8 << str << -1; + + // 3.1.8 + utf8.clear(); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + utf8 += char(0xbf); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.8") << utf8 << str << -1; + + // 3.1.9 + utf8.clear(); + for (uint i = 0x80; i<= 0xbf; ++i) + utf8 += i; + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.1.9") << utf8 << str << -1; + + // 3.2.1 + utf8.clear(); + str.clear(); + for (uint i = 0xc8; i <= 0xdf; ++i) { + utf8 += i; + utf8 += char(0x20); + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.1") << utf8 << str << -1; + + // 3.2.2 + utf8.clear(); + str.clear(); + for (uint i = 0xe0; i <= 0xef; ++i) { + utf8 += i; + utf8 += char(0x20); + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.2") << utf8 << str << -1; + + // 3.2.3 + utf8.clear(); + str.clear(); + for (uint i = 0xf0; i <= 0xf7; ++i) { + utf8 += i; + utf8 += 0x20; + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.3") << utf8 << str << -1; + + // 3.2.4 + utf8.clear(); + str.clear(); + for (uint i = 0xf8; i <= 0xfb; ++i) { + utf8 += i; + utf8 += 0x20; + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.4") << utf8 << str << -1; + + // 3.2.5 + utf8.clear(); + str.clear(); + for (uint i = 0xfc; i <= 0xfd; ++i) { + utf8 += i; + utf8 += 0x20; + + str += QChar::ReplacementCharacter; + str += QChar(0x0020); + } + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.2.5") << utf8 << str << -1; + + // 3.3.1 + utf8.clear(); + utf8 += char(0xc0); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.1") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.1-1") << utf8 << str << -1; + + // 3.3.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xe0); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2-2") << utf8 << str << -1; + utf8 += 0x30; + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.2-3") << utf8 << str << -1; + + // 3.3.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf0); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.3-5") << utf8 << str << -1; + + // 3.3.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf8); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-6") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.4-7") << utf8 << str << -1; + + // 3.3.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-6") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-7") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfc); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-8") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.5-9") << utf8 << str << -1; + + // 3.3.6 + utf8.clear(); + utf8 += char(0xdf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.6") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.6-1") << utf8 << str << -1; + + // 3.3.7 + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xef); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.7-3") << utf8 << str << -1; + + // 3.3.8 + utf8.clear(); + utf8 += char(0xf7); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf7); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xf7); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.8-5") << utf8 << str << -1; + + // 3.3.9 + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfb); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfb); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-6") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.9-7") << utf8 << str << -1; + + // 3.3.10 + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-1") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-2") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-3") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-4") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-5") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-6") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-7") << utf8 << str << -1; + + utf8.clear(); + utf8 += char(0xfd); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-8") << utf8 << str << -1; + utf8 += char(0x30); + str += QChar(0x30); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.3.10-9") << utf8 << str << -1; + + // 3.4 + utf8.clear(); + utf8 += char(0xc0); + utf8 += char(0xe0); + utf8 += char(0x80); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xdf); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xf7); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xfb); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xfd); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.4") << utf8 << str << -1; + + // 3.5.1 + utf8.clear(); + utf8 += char(0xfe); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.5.1") << utf8 << str << -1; + + // 3.5.2 + utf8.clear(); + utf8 += char(0xff); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.5.2") << utf8 << str << -1; + + // 3.5.2 + utf8.clear(); + utf8 += char(0xfe); + utf8 += char(0xfe); + utf8 += char(0xff); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 3.5.2-1") << utf8 << str << -1; + + // 4.1.1 + utf8.clear(); + utf8 += char(0xc0); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.1") << utf8 << str << -1; + + // 4.1.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.2") << utf8 << str << -1; + + // 4.1.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.3") << utf8 << str << -1; + + // 4.1.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.4") << utf8 << str << -1; + + // 4.1.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0xaf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.5") << utf8 << str << -1; + + // 4.2.1 + utf8.clear(); + utf8 += char(0xc1); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.1") << utf8 << str << -1; + + // 4.2.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x9f); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.2") << utf8 << str << -1; + + // 4.2.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x8f); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.3") << utf8 << str << -1; + + // 4.2.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x87); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.4") << utf8 << str << -1; + + // 4.2.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x83); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.5") << utf8 << str << -1; + + // 4.3.1 + utf8.clear(); + utf8 += char(0xc0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.1") << utf8 << str << -1; + + // 4.3.2 + utf8.clear(); + utf8 += char(0xe0); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.2") << utf8 << str << -1; + + // 4.3.3 + utf8.clear(); + utf8 += char(0xf0); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.3") << utf8 << str << -1; + + // 4.3.4 + utf8.clear(); + utf8 += char(0xf8); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.4") << utf8 << str << -1; + + // 4.3.5 + utf8.clear(); + utf8 += char(0xfc); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.5") << utf8 << str << -1; + + // 5.1.1 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xa0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.1") << utf8 << str << -1; + + // 5.1.2 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xad); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.2") << utf8 << str << -1; + + // 5.1.3 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xae); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.3") << utf8 << str << -1; + + // 5.1.4 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xaf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.4") << utf8 << str << -1; + + // 5.1.5 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.5") << utf8 << str << -1; + + // 5.1.6 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xbe); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.6") << utf8 << str << -1; + + // 5.1.7 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.7") << utf8 << str << -1; + + // 5.2.1 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xa0); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.1") << utf8 << str << -1; + + // 5.2.2 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xa0); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.2") << utf8 << str << -1; + + // 5.2.3 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xad); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.3") << utf8 << str << -1; + + // 5.2.4 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xad); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.4") << utf8 << str << -1; + + // 5.2.5 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xae); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.5") << utf8 << str << -1; + + // 5.2.6 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xae); + utf8 += char(0x80); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.6") << utf8 << str << -1; + + // 5.2.7 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xaf); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xb0); + utf8 += char(0x80); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.7") << utf8 << str << -1; + + // 5.2.8 + utf8.clear(); + utf8 += char(0xed); + utf8 += char(0xaf); + utf8 += char(0xbf); + utf8 += char(0xed); + utf8 += char(0xbf); + utf8 += char(0xbf); + str = fromInvalidUtf8Sequence(utf8); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1; + + // 5.3.1 - non-character code + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbe); + //str = QChar(QChar::ReplacementCharacter); + str = QChar(0xfffe); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1; + + // 5.3.2 - non-character code + utf8.clear(); + utf8 += char(0xef); + utf8 += char(0xbf); + utf8 += char(0xbf); + //str = QChar(QChar::ReplacementCharacter); + str = QChar(0xffff); + QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1; +} + +void tst_QStringConverter::utf8Codec() +{ + QFETCH(QByteArray, utf8); + QFETCH(QString, res); + QFETCH(int, len); + + QStringDecoder decoder(QStringDecoder::Utf8, QStringDecoder::Flag::Stateless); + QString str = decoder(QByteArrayView(utf8).first(len < 0 ? qstrlen(utf8.constData()) : len)); + QCOMPARE(str, res); + + str = QString::fromUtf8(utf8.isNull() ? 0 : utf8.constData(), len); + QCOMPARE(str, res); +} + +QT_WARNING_PUSH +QT_WARNING_DISABLE_DEPRECATED +void tst_QStringConverter::utf8bom_data() +{ + QTest::addColumn<QByteArray>("data"); + QTest::addColumn<QString>("result"); + + QTest::newRow("nobom") + << QByteArray("\302\240", 2) + << QString::fromLatin1("\240"); + + { + static const char16_t data[] = { 0x201d }; + QTest::newRow("nobom 2") + << QByteArray("\342\200\235", 3) + << QString::fromUtf16(data, std::size(data)); + } + + { + static const char16_t data[] = { 0xf000 }; + QTest::newRow("bom1") + << QByteArray("\357\200\200", 3) + << QString::fromUtf16(data, std::size(data)); + } + + { + static const char16_t data[] = { 0xfec0 }; + QTest::newRow("bom2") + << QByteArray("\357\273\200", 3) + << QString::fromUtf16(data, std::size(data)); + } + + { + QTest::newRow("normal-bom") + << QByteArray("\357\273\277a", 4) + << QString("a"); + } + + { // test the non-SIMD code-path + static const char16_t data[] = { 0x61, 0xfeff, 0x62 }; + QTest::newRow("middle-bom (non SIMD)") + << QByteArray("a\357\273\277b") + << QString::fromUtf16(data, std::size(data)); + } + + { // test the SIMD code-path + static const char16_t data[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0xfeff, 0x6d }; + QTest::newRow("middle-bom (SIMD)") + << QByteArray("abcdefghijkl\357\273\277m") + << QString::fromUtf16(data, std::size(data)); + } +} +QT_WARNING_POP + +void tst_QStringConverter::utf8bom() +{ + QFETCH(QByteArray, data); + QFETCH(QString, result); + + QStringDecoder decoder(QStringDecoder::Utf8); + + QCOMPARE(decoder(data), result); +} + +// someone set us up the BOM! +void tst_QStringConverter::roundtripBom_data() +{ + QTest::addColumn<QStringView>("utf16"); + QTest::addColumn<QStringConverter::Encoding>("code"); + + for (const auto &code : codes) { + if (size_t(code.code) >= encodedBoms.size()) + break; + if (code.limitation != FullUnicode) + continue; // can't represent BOM + + for (const TestString &s : testStrings) { + if (s.utf16.isEmpty()) + continue; + QTest::addRow("%s:%s", code.name, s.description) << s.utf16 << code.code; + } + } +} + +void tst_QStringConverter::roundtripBom() +{ + QFETCH(QStringView, utf16); + QFETCH(QStringConverter::Encoding, code); + QByteArray encodedBom = encodedBoms[code].toByteArray(); + QChar bom = QChar::ByteOrderMark; + + // QStringConverter defaults to producing no BOM, but interpreting it if it + // is there + + QStringEncoder encoderWithoutBom(code); + QStringEncoder encoderWithBom(code, QStringEncoder::Flag::WriteBom); + QByteArray encodedWithoutBom = encoderWithoutBom(utf16); + QByteArray encodedWithBom = encoderWithBom(utf16); + QCOMPARE(encodedWithBom, encodedBom + encodedWithoutBom); + + QStringDecoder decoderWithoutBom(code, QStringDecoder::Flag::ConvertInitialBom); + QStringDecoder decoderWithBom(code); + QString decoded = decoderWithBom(encodedWithBom); + QCOMPARE(decoded, utf16); + + decoded = decoderWithoutBom(encodedWithBom); + QCOMPARE(decoded, bom + utf16.toString()); +} + +void tst_QStringConverter::utf8stateful_data() +{ + QTest::addColumn<QByteArray>("buffer1"); + QTest::addColumn<QByteArray>("buffer2"); + QTest::addColumn<QString>("result"); // null QString indicates decoder error + + // valid buffer continuations + QTest::newRow("1of2+valid") << QByteArray("\xc2") << QByteArray("\xa0") << "\xc2\xa0"; + QTest::newRow("1of3+valid") << QByteArray("\xe0") << QByteArray("\xa0\x80") << "\xe0\xa0\x80"; + QTest::newRow("2of3+valid") << QByteArray("\xe0\xa0") << QByteArray("\x80") << "\xe0\xa0\x80"; + QTest::newRow("1of4+valid") << QByteArray("\360") << QByteArray("\220\210\203") << "\360\220\210\203"; + QTest::newRow("2of4+valid") << QByteArray("\360\220") << QByteArray("\210\203") << "\360\220\210\203"; + QTest::newRow("3of4+valid") << QByteArray("\360\220\210") << QByteArray("\203") << "\360\220\210\203"; + QTest::newRow("1ofBom+valid") << QByteArray("\xef") << QByteArray("\xbb\xbf") << ""; + QTest::newRow("2ofBom+valid") << QByteArray("\xef\xbb") << QByteArray("\xbf") << ""; + + // invalid continuation + QTest::newRow("1of2+invalid") << QByteArray("\xc2") << QByteArray("a") << QString(); + QTest::newRow("1of3+invalid") << QByteArray("\xe0") << QByteArray("a") << QString(); + QTest::newRow("2of3+invalid") << QByteArray("\xe0\xa0") << QByteArray("a") << QString(); + QTest::newRow("1of4+invalid") << QByteArray("\360") << QByteArray("a") << QString(); + QTest::newRow("2of4+invalid") << QByteArray("\360\220") << QByteArray("a") << QString(); + QTest::newRow("3of4+invalid") << QByteArray("\360\220\210") << QByteArray("a") << QString(); + + // overlong sequence: + QTest::newRow("overlong-1of2") << QByteArray("\xc1") << QByteArray("\x81") << QString(); + QTest::newRow("overlong-1of3") << QByteArray("\xe0") << QByteArray("\x81\x81") << QString(); + QTest::newRow("overlong-2of3") << QByteArray("\xe0\x81") << QByteArray("\x81") << QString(); + QTest::newRow("overlong-1of4") << QByteArray("\xf0") << QByteArray("\x80\x81\x81") << QString(); + QTest::newRow("overlong-2of4") << QByteArray("\xf0\x80") << QByteArray("\x81\x81") << QString(); + QTest::newRow("overlong-3of4") << QByteArray("\xf0\x80\x81") << QByteArray("\x81") << QString(); + + // out of range: + // leading byte 0xF4 can produce codepoints above U+10FFFF, which aren't valid + QTest::newRow("outofrange1-1of4") << QByteArray("\xf4") << QByteArray("\x90\x80\x80") << QString(); + QTest::newRow("outofrange1-2of4") << QByteArray("\xf4\x90") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange1-3of4") << QByteArray("\xf4\x90\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange2-1of4") << QByteArray("\xf5") << QByteArray("\x90\x80\x80") << QString(); + QTest::newRow("outofrange2-2of4") << QByteArray("\xf5\x90") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange2-3of4") << QByteArray("\xf5\x90\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange-1of5") << QByteArray("\xf8") << QByteArray("\x88\x80\x80\x80") << QString(); + QTest::newRow("outofrange-2of5") << QByteArray("\xf8\x88") << QByteArray("\x80\x80\x80") << QString(); + QTest::newRow("outofrange-3of5") << QByteArray("\xf8\x88\x80") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange-4of5") << QByteArray("\xf8\x88\x80\x80") << QByteArray("\x80") << QString(); + QTest::newRow("outofrange-1of6") << QByteArray("\xfc") << QByteArray("\x84\x80\x80\x80\x80") << QString(); + QTest::newRow("outofrange-2of6") << QByteArray("\xfc\x84") << QByteArray("\x80\x80\x80\x80") << QString(); + QTest::newRow("outofrange-3of6") << QByteArray("\xfc\x84\x80") << QByteArray("\x80\x80\x80") << QString(); + QTest::newRow("outofrange-4of6") << QByteArray("\xfc\x84\x80\x80") << QByteArray("\x80\x80") << QString(); + QTest::newRow("outofrange-5of6") << QByteArray("\xfc\x84\x80\x80\x80") << QByteArray("\x80") << QString(); +} + +void tst_QStringConverter::utf8stateful() +{ + QFETCH(QByteArray, buffer1); + QFETCH(QByteArray, buffer2); + QFETCH(QString, result); + + { + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + + QString decoded = decoder(buffer1); + if (result.isNull()) { + if (!decoder.hasError()) { + // incomplete data + decoded += decoder(buffer2); + QVERIFY(decoder.hasError()); + } + } else { + QVERIFY(!decoder.hasError()); + decoded += decoder(buffer2); + QVERIFY(!decoder.hasError()); + QCOMPARE(decoded, result); + } + } + + if (!buffer2.isEmpty()) { + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + + QString decoded; + for (char c : buffer1) + decoded += decoder(QByteArrayView(&c, 1)); + for (char c : buffer2) + decoded += decoder(QByteArrayView(&c, 1)); + if (result.isNull()) { + QVERIFY(decoder.hasError()); + } else { + QVERIFY(!decoder.hasError()); + QCOMPARE(decoded, result); + } + } +} + +void tst_QStringConverter::utfHeaders_data() +{ + QTest::addColumn<QStringConverter::Encoding>("encoding"); + QTest::addColumn<QStringConverter::Flag>("flags"); + QTest::addColumn<QByteArray>("encoded"); + QTest::addColumn<QString>("unicode"); + + QTest::newRow("utf8 bom") + << QStringConverter::Utf8 + << QStringConverter::Flag::WriteBom + << QByteArray("\xef\xbb\xbfhello") + << QString::fromLatin1("hello"); + QTest::newRow("utf8 nobom") + << QStringConverter::Utf8 + << QStringConverter::Flag::WriteBom + << QByteArray("hello") + << QString::fromLatin1("hello"); + QTest::newRow("utf8 bom ignore header") + << QStringConverter::Utf8 + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xef\xbb\xbfhello") + << (QString(QChar(0xfeff)) + QString::fromLatin1("hello")); + QTest::newRow("utf8 nobom ignore header") + << QStringConverter::Utf8 + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("hello") + << QString::fromLatin1("hello"); + + QTest::newRow("utf16 bom be") + << QStringConverter::Utf16 + << QStringConverter::Flag::WriteBom + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << QString::fromLatin1("hel"); + QTest::newRow("utf16 bom le") + << QStringConverter::Utf16 + << QStringConverter::Flag::WriteBom + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << QString::fromLatin1("hel"); + if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { + QTest::newRow("utf16 nobom") + << QStringConverter::Utf16 + << QStringConverter::Flag::WriteBom + << QByteArray("\0h\0e\0l", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16 bom be ignore header") + << QStringConverter::Utf16 + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } else { + QTest::newRow("utf16 nobom") + << QStringConverter::Utf16 + << QStringConverter::Flag::WriteBom + << QByteArray("h\0e\0l\0", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16 bom le ignore header") + << QStringConverter::Utf16 + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } + + QTest::newRow("utf16-be bom be") + << QStringConverter::Utf16BE + << QStringConverter::Flag::WriteBom + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-be nobom") + << QStringConverter::Utf16BE + << QStringConverter::Flag::WriteBom + << QByteArray("\0h\0e\0l", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-be bom be ignore header") + << QStringConverter::Utf16BE + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + + QTest::newRow("utf16-le bom le") + << QStringConverter::Utf16LE + << QStringConverter::Flag::WriteBom + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-le nobom") + << QStringConverter::Utf16LE + << QStringConverter::Flag::WriteBom + << QByteArray("h\0e\0l\0", 6) + << QString::fromLatin1("hel"); + QTest::newRow("utf16-le bom le ignore header") + << QStringConverter::Utf16LE + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + + QTest::newRow("utf32 bom be") + << QStringConverter::Utf32 + << QStringConverter::Flag::WriteBom + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << QString::fromLatin1("hel"); + QTest::newRow("utf32 bom le") + << QStringConverter::Utf32 + << QStringConverter::Flag::WriteBom + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << QString::fromLatin1("hel"); + if (QSysInfo::ByteOrder == QSysInfo::BigEndian) { + QTest::newRow("utf32 nobom") + << QStringConverter::Utf32 + << QStringConverter::Flag::WriteBom + << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32 bom be ignore header") + << QStringConverter::Utf32 + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } else { + QTest::newRow("utf32 nobom") + << QStringConverter::Utf32 + << QStringConverter::Flag::WriteBom + << QByteArray("h\0\0\0e\0\0\0l\0\0\0", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32 bom le ignore header") + << QStringConverter::Utf32 + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + } + + QTest::newRow("utf32-be bom be") + << QStringConverter::Utf32BE + << QStringConverter::Flag::WriteBom + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-be nobom") + << QStringConverter::Utf32BE + << QStringConverter::Flag::WriteBom + << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-be bom be ignore header") + << QStringConverter::Utf32BE + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); + + QTest::newRow("utf32-le bom le") + << QStringConverter::Utf32LE + << QStringConverter::Flag::WriteBom + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-le nobom") + << QStringConverter::Utf32LE + << QStringConverter::Flag::WriteBom + << QByteArray("h\0\0\0e\0\0\0l\0\0\0", 12) + << QString::fromLatin1("hel"); + QTest::newRow("utf32-le bom le ignore header") + << QStringConverter::Utf32LE + << QStringConverter::Flag::ConvertInitialBom + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << (QString(QChar(0xfeff)) + QString::fromLatin1("hel")); +} + +void tst_QStringConverter::utfHeaders() +{ + QFETCH(QStringConverter::Encoding, encoding); + QFETCH(QStringConverter::Flag, flags); + QFETCH(QByteArray, encoded); + QFETCH(QString, unicode); + + QLatin1String ignoreReverseTestOn = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? QLatin1String(" le") : QLatin1String(" be"); + QString rowName(QTest::currentDataTag()); + + { + QStringDecoder decode(encoding, flags); + QVERIFY(decode.isValid()); + + QString result = decode(encoded); + QCOMPARE(result.size(), unicode.size()); + QCOMPARE(result, unicode); + } + + { + QStringDecoder decode(encoding, flags); + QVERIFY(decode.isValid()); + + QString result; + for (char c : encoded) + result += decode(QByteArrayView(&c, 1)); + QCOMPARE(result.size(), unicode.size()); + QCOMPARE(result, unicode); + } + + if (!rowName.endsWith("nobom") && !rowName.contains(ignoreReverseTestOn)) { + { + QStringEncoder encode(encoding, flags); + QVERIFY(encode.isValid()); + QByteArray reencoded = encode(unicode); + QCOMPARE(reencoded, encoded); + } + + { + QStringEncoder encode(encoding, flags); + QVERIFY(encode.isValid()); + QByteArray reencoded; + for (QChar c : unicode) + reencoded += encode(QStringView(&c, 1)); + QCOMPARE(reencoded, encoded); + } + } +} + +void tst_QStringConverter::encodingForName_data() +{ + QTest::addColumn<QByteArray>("name"); + QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); + + auto row = [](const char *name, std::optional<QStringConverter::Encoding> expected = std::nullopt) { + auto protect = [](auto p) { return p ? *p ? p : "<empty>" : "<nullptr>"; }; + QTest::addRow("%s", protect(name)) << QByteArray(name) << expected; + }; + + row("UTF-8", QStringConverter::Utf8); + row("utf8", QStringConverter::Utf8); + row("Utf-8", QStringConverter::Utf8); + row("UTF-16", QStringConverter::Utf16); + row("UTF-16le", QStringConverter::Utf16LE); + row("ISO-8859-1", QStringConverter::Latin1); + row("ISO8859-1", QStringConverter::Latin1); + row("iso8859-1", QStringConverter::Latin1); + row("latin1", QStringConverter::Latin1); + row("latin-1_-", QStringConverter::Latin1); + row("latin_1-_", QStringConverter::Latin1); + row("-_latin-1", QStringConverter::Latin1); + row("_-latin_1", QStringConverter::Latin1); + + // failures: + row(nullptr); + row(""); + row("latin2"); + row("latin42"); + row(" latin1"); // spaces are significant + row("\tlatin1"); // HTs are significant +} + +void tst_QStringConverter::encodingForName() +{ + QFETCH(const QByteArray, name); + QFETCH(const std::optional<QStringConverter::Encoding>, encoding); + + const auto *ptr = name.isNull() ? nullptr : name.data(); + + const auto e = QStringConverter::encodingForName(ptr); + QCOMPARE(e, encoding); +} + +void tst_QStringConverter::nameForEncoding_data() +{ + QTest::addColumn<QByteArray>("name"); + QTest::addColumn<QStringConverter::Encoding>("encoding"); + + QTest::newRow("UTF-8") << QByteArray("UTF-8") << QStringConverter::Utf8; + QTest::newRow("UTF-16") << QByteArray("UTF-16") << QStringConverter::Utf16; + QTest::newRow("UTF-16LE") << QByteArray("UTF-16LE") << QStringConverter::Utf16LE; + QTest::newRow("ISO-8859-1") << QByteArray("ISO-8859-1") << QStringConverter::Latin1; +} + +void tst_QStringConverter::nameForEncoding() +{ + QFETCH(QByteArray, name); + QFETCH(QStringConverter::Encoding, encoding); + + QByteArray n = QStringConverter::nameForEncoding(encoding); + QCOMPARE(n, name); +} + +void tst_QStringConverter::encodingForData_data() +{ + QTest::addColumn<QByteArray>("encoded"); + QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); + + + QTest::newRow("utf8 bom") + << QByteArray("\xef\xbb\xbfhello") + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("utf8 nobom") + << QByteArray("hello") + << std::optional<QStringConverter::Encoding>(); + + QTest::newRow("utf16 bom be") + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16BE); + QTest::newRow("utf16 bom le") + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE); + QTest::newRow("utf16 nobom be") + << QByteArray("\0<\0e\0l", 6) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16BE); + QTest::newRow("utf16 nobom le") + << QByteArray("<\0e\0l\0", 6) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE); + QTest::newRow("utf16 nobom no match") + << QByteArray("h\0e\0l\0", 6) + << std::optional<QStringConverter::Encoding>(); + + QTest::newRow("utf32 bom be") + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf32BE); + QTest::newRow("utf32 bom le") + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf32LE); + QTest::newRow("utf32 nobom be") + << QByteArray("\0\0\0<\0\0\0e\0\0\0l", 12) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf32BE); + QTest::newRow("utf32 nobom") + << QByteArray("<\0\0\0e\0\0\0l\0\0\0", 12) + << std::optional<QStringConverter::Encoding>(QStringConverter::Utf32LE); + QTest::newRow("utf32 nobom no match") + << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12) + << std::optional<QStringConverter::Encoding>(); +} + +void tst_QStringConverter::encodingForData() +{ + QFETCH(QByteArray, encoded); + QFETCH(std::optional<QStringConverter::Encoding>, encoding); + + auto e = QStringConverter::encodingForData(encoded, char16_t('<')); + QCOMPARE(e, encoding); +} + + +void tst_QStringConverter::encodingForHtml_data() +{ + QTest::addColumn<QByteArray>("html"); + QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); + QTest::addColumn<QByteArray>("name"); // ICU name if we have ICU support + + QByteArray html = "<html><head></head><body>blah</body></html>"; + QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>"; + QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-8859-15"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=SJIS\" /></head></html>"; + QTest::newRow("sjis") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Shift_JIS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022-JP\" /></head></html>"; + QTest::newRow("ISO-2022-JP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022\" /></head></html>"; + QTest::newRow("ISO-2022") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312\" /></head></html>"; + QTest::newRow("GB2312") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5\" /></head></html>"; + QTest::newRow("Big5") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB18030\" /></head></html>"; + QTest::newRow("GB18030") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB18030"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312-HKSCS\" /></head></html>"; + QTest::newRow("GB2312-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312-HKSCS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5-HKSCS\" /></head></html>"; + QTest::newRow("Big5-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5-HKSCS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucJP\" /></head></html>"; + QTest::newRow("EucJP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucKR\" /></head></html>"; + QTest::newRow("EucKR") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-KR"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-R\" /></head></html>"; + QTest::newRow("KOI8-R") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-R"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-U\" /></head></html>"; + QTest::newRow("KOI8-U") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-U"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>"; + QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); + + html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>"; + QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); + + html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>"; + QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>"; + QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>"; + QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>"; + QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>"; + QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>"; + QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + // Test invalid charsets. + html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>"; + QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>"; + QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>() << QByteArray("UTF-8"); + + html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\""; + html.prepend(QByteArray().fill(' ', 512 - html.size())); + QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + + html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8"; + QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + + html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8"; + QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>"; + QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); + + html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>"; + QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); + + const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 }; + html = src; + QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE) << QByteArray("UTF-16LE"); + + + html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: " + "'Galatia SIL'; font-size: 27px; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; " + "line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: " + "auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: " + "none;\">ͻ</span>\000"; + QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); + + html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">" + "<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16. + QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); +} + +void tst_QStringConverter::encodingForHtml() +{ + QFETCH(QByteArray, html); + QFETCH(std::optional<QStringConverter::Encoding>, encoding); + QFETCH(QByteArray, name); + + QCOMPARE(QStringConverter::encodingForHtml(html), encoding); + + QStringDecoder decoder = QStringDecoder::decoderForHtml(html); + if (encoding || // we should have a valid decoder independent of ICU support + decoder.isValid()) { // we got a valid decoder through ICU + QCOMPARE(decoder.name(), name); + } +} + +void tst_QStringConverter::availableCodesAreAvailable() +{ + auto codecs = QStringConverter::availableCodecs(); + for (const auto &codecName: codecs) + QVERIFY(QStringEncoder(codecName.toLatin1()).isValid()); +} + +class LoadAndConvert: public QRunnable +{ +public: + LoadAndConvert(QStringConverter::Encoding encoding, QString *destination) + : encode(encoding), decode(encoding), target(destination) + {} + QStringEncoder encode; + QStringDecoder decode; + QString *target; + void run() override + { + QString str = QString::fromLatin1("abcdefghijklmonpqrstufvxyz"); + for (int i = 0; i < 10000; ++i) { + QByteArray b = encode(str); + *target = decode(b); + QCOMPARE(*target, str); + } + } +}; + +void tst_QStringConverter::initTestCase() +{ + if (localeIsUtf8()) + qInfo("System locale is UTF-8"); + else + qInfo("System locale is not UTF-8"); +} + +void tst_QStringConverter::threadSafety() +{ +#if defined(Q_OS_WASM) + QSKIP("This test misbehaves on WASM. Investigation needed (QTBUG-110067)"); +#endif + + QThreadPool::globalInstance()->setMaxThreadCount(12); + + QList<QString> res; + res.resize(QStringConverter::LastEncoding + 1); + for (int i = 0; i < QStringConverter::LastEncoding + 1; ++i) { + QThreadPool::globalInstance()->start(new LoadAndConvert(QStringConverter::Encoding(i), &res[i])); + } + + // wait for all threads to finish working + QThreadPool::globalInstance()->waitForDone(); + + for (auto b : res) + QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz")); +} + +#ifdef Q_OS_WIN +void tst_QStringConverter::fromLocal8Bit_data() +{ + QTest::addColumn<QByteArray>("eightBit"); + QTest::addColumn<QString>("utf16"); + QTest::addColumn<quint32>("codePage"); + + constexpr uint WINDOWS_1252 = 1252u; + QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252; + constexpr uint SHIFT_JIS = 932u; + // Mostly two byte characters, but the comma is a single byte character (0xa4) + QTest::newRow("shiftJIS") + << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba + << u"こんにちは、世界!"_s << SHIFT_JIS; + + constexpr uint GB_18030 = 54936u; + QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界!"_s + << GB_18030; +} + +void tst_QStringConverter::fromLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (char c : eightBit) + result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + // Decode the full string again, this time without state + state.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_special_cases() +{ + QStringConverter::State state; + constexpr uint SHIFT_JIS = 932u; + // Decode a 2-octet character, but only provide 1 octet at first: + QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second octet: + result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state); + QCOMPARE(result, u"こ"); + QCOMPARE(state.remainingChars, 0); + + // And without state: + result.clear(); + QStringConverter::State statelessState; + statelessState.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &statelessState); + result += QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &statelessState); + // 0xb1 is a valid single-octet character in Shift-JIS, so the output + // isn't really what you would expect: + QCOMPARE(result, QString(QChar::ReplacementCharacter) + u'ア'); + QCOMPARE(statelessState.remainingChars, 0); + + // Now try a 3-octet UTF-8 sequence: + result.clear(); + state.clear(); + constexpr uint UTF8 = 65001u; + // First the first 2 octets: + result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the remaining octet: + result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QCOMPARE(result, u"你"); + QCOMPARE(state.remainingChars, 0); + + // Now the same, but there is an incomplete sequence at the start + result.clear(); + state.clear(); + result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + // Remaining octet (and a '.' to force it to discard something from the + // internal state which is currently limited to 4 octets): + result += QLocal8Bit::convertToUnicode_sys("\xa0.", UTF8, &state); + QCOMPARE(result, QChar::ReplacementCharacter + u"你."_s); + QCOMPARE(state.remainingChars, 0); + + // Test QTBUG-118834, which is failing + result.clear(); + state.clear(); + result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + // Remaining octet: + result += QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QEXPECT_FAIL("", "QTBUG-118834: We don't output anything because it's " + "within the size of our internal state, and we cannot " + "signal that it needs to be drained.", Continue); + QCOMPARE(result, QChar::ReplacementCharacter + u"你"_s); + QEXPECT_FAIL("", "QTBUG-118834: As above", Continue); + QCOMPARE(state.remainingChars, 0); + + // Now try a 4-octet GB 18030 sequence: + result.clear(); + state.clear(); + constexpr uint GB_18030 = 54936u; + const char sequence[] = "\x95\x32\x90\x31"; + // Repeat the sequence multiple times to test handling of exhaustion of + // internal buffer + QByteArray repeated = QByteArray(sequence).repeated(2049); + QByteArrayView octets = QByteArrayView(repeated); + result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide one more octet: + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the last octet + the rest of the string + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(3), GB_18030, &state); + QCOMPARE(result.first(2), u"𠂇"); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3; + QByteArray input; + QT_TRY { + input.reserve(size); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // fill with '、' - a single octet character in Shift-JIS + input.fill('\xa4', std::numeric_limits<int>::max() - 1); + // then append 'こ' - a two octet character in Shift-JIS + // which is now straddling the 2 GiB boundary + input += "\x82\xb1"; + // then append another two '、', so that our output is also crossing the + // 2 GiB boundary + input += "\xa4\xa4"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint SHIFT_JIS = 932u; + QStringConverter::State state; + QString result; + QT_TRY { + result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16 + QCOMPARE(result.last(4), u"、こ、、"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} + +void tst_QStringConverter::toLocal8Bit_data() +{ + fromLocal8Bit_data(); +} + +void tst_QStringConverter::toLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (QChar c : utf16) + result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + // Decode the full string again, this time without state + state.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_special_cases() +{ + QStringConverter::State state; + // Normally utf8 goes through a different code path, but we can force it here + constexpr uint UTF8 = 65001u; + // Decode a 2-code unit character, but only provide 1 code unit at first: + const char16_t a[] = u"𬽦"; + QStringView codeUnits = a; + QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second code unit: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); + + // Retain compat with the behavior for toLocal8Bit: + QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?"); + + // QString::toLocal8Bit is already stateless, but test stateless handling + // explicitly anyway: + result.clear(); + QStringConverter::State statelessState; + statelessState.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &statelessState); + result += QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &statelessState); + // Windows uses the replacement character for invalid characters: + QCOMPARE(result, "\ufffd\ufffd"); + + // Now do the same, but the second time we feed in a character, we also + // provide many more so the internal stack buffer is not large enough. + result.clear(); + state.clear(); + QString str = QStringView(a).toString().repeated(2048); + codeUnits = str; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then we provide the rest of the string: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + constexpr qsizetype TwoGiB = qsizetype(std::numeric_limits<int>::max()); + QString input; + QT_TRY { + input.reserve(TwoGiB + 1); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // Fill with a single code unit character + input.fill(u'.', TwoGiB - 1); + // Then append a 2 code unit character, so that the input straddles the 2 GiB + // boundary + input += u"🙂"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint UTF8 = 65001u; + QStringConverter::State state; + QByteArray result; + QT_TRY { + result = QLocal8Bit::convertFromUnicode_sys(input, UTF8, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QUtf8StringView rView = result; + QCOMPARE(rView.size(), TwoGiB + 3); // The 2 code unit smiley is 4 code units in UTF-8 + QCOMPARE(rView.last(7), u8"...🙂"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} +#endif // Q_OS_WIN + +struct DontCrashAtExit { + ~DontCrashAtExit() { + QStringDecoder decoder(QStringDecoder::Utf8); + QVERIFY(decoder.isValid()); + (void)decoder("azerty"); + } +} dontCrashAtExit; + + +QTEST_MAIN(tst_QStringConverter) +#include "tst_qstringconverter.moc" |