diff options
Diffstat (limited to 'tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp')
-rw-r--r-- | tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp | 815 |
1 files changed, 765 insertions, 50 deletions
diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 231df0390e..ed3f91ac94 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -1,13 +1,33 @@ // Copyright (C) 2021 The Qt Company Ltd. // Copyright (C) 2016 Intel Corporation. -// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only #include <QTest> +#include <QtCore/private/qglobal_p.h> #include <qstringconverter.h> +#include <private/qstringconverter_p.h> #include <qthreadpool.h> #include <array> +#include <numeric> + +using namespace Qt::StringLiterals; + +QT_BEGIN_NAMESPACE +namespace QTest { +template <typename T> +char *toString(const std::optional<T> &opt) +{ + if (opt) + return QTest::toString(*opt); + else + return qstrdup("std::nullopt"); +} +} // namespace QTest +QT_END_NAMESPACE + +using QTest::toString; static constexpr bool IsBigEndian = QSysInfo::ByteOrder == QSysInfo::BigEndian; enum CodecLimitation { @@ -29,8 +49,6 @@ static constexpr bool localeIsUtf8() } #endif -using namespace Qt::StringLiterals; - struct Codec { const char name[12]; @@ -129,6 +147,25 @@ private slots: void roundtrip_data(); void roundtrip(); + void convertL1U8(); + + void convertL1U16(); + +#if QT_CONFIG(icu) + void roundtripIcu_data(); + void roundtripIcu(); + void icuInvalidCharacter_data(); + void icuInvalidCharacter(); + void icuEncodeEdgeCases_data(); + void icuEncodeEdgeCases(); + void icuUsableAfterMove(); + void charByCharConsistency_data(); + void charByCharConsistency(); + void byteByByteConsistency_data(); + void byteByByteConsistency(); + void statefulPieceWise(); +#endif + void flagF7808080() const; void utf8Codec_data(); @@ -156,6 +193,20 @@ private slots: void encodingForHtml_data(); void encodingForHtml(); + + void availableCodesAreAvailable(); + +#ifdef Q_OS_WIN + // On all other systems local 8-bit encoding is UTF-8 + void fromLocal8Bit_data(); + void fromLocal8Bit(); + void fromLocal8Bit_special_cases(); + void fromLocal8Bit_2GiB(); + void toLocal8Bit_data(); + void toLocal8Bit(); + void toLocal8Bit_special_cases(); + void toLocal8Bit_2GiB(); +#endif }; void tst_QStringConverter::constructByName() @@ -230,8 +281,8 @@ void tst_QStringConverter::invalidConverter() decoder.resetState(); QVERIFY(!decoder.hasError()); - QChar buffer[100]; - QChar *position = decoder.appendToBuffer(buffer, "Even more"); + char16_t buffer[100]; + char16_t *position = decoder.appendToBuffer(buffer, "Even more"); QCOMPARE(position, buffer); QVERIFY(decoder.hasError()); } @@ -333,12 +384,39 @@ void tst_QStringConverter::convertUtf8CharByChar() QCOMPARE(reencoded, ba); } +void tst_QStringConverter::convertL1U16() +{ + const QLatin1StringView latin1("some plain latin1 text"); + const QString qstr(latin1); + + QStringDecoder decoder(QStringConverter::Latin1); + QVERIFY(decoder.isValid()); + QString uniString = decoder(latin1); + QCOMPARE(uniString, qstr); + QCOMPARE(latin1, uniString.toLatin1()); + + // do it again (using .decode()) + uniString = decoder.decode(latin1); + QCOMPARE(uniString, qstr); + QCOMPARE(latin1, uniString.toLatin1()); + + QStringEncoder encoder(QStringConverter::Latin1); + QByteArray reencoded = encoder(uniString); + QCOMPARE(reencoded, QByteArrayView(latin1)); + QCOMPARE(reencoded, uniString.toLatin1()); + + // do it again (using .encode()) + reencoded = encoder.encode(uniString); + QCOMPARE(reencoded, QByteArrayView(latin1)); + QCOMPARE(reencoded, uniString.toLatin1()); +} + void tst_QStringConverter::roundtrip_data() { QTest::addColumn<QStringView>("utf16"); QTest::addColumn<QStringConverter::Encoding>("code"); - for (const auto code : codes) { + for (const auto &code : codes) { for (const TestString &s : testStrings) { // rules: // 1) don't pass the null character to the System codec @@ -351,13 +429,17 @@ void tst_QStringConverter::roundtrip_data() } if (code.limitation == FullUnicode) { - const char32_t zeroVal = 0x11136; // Unicode's representation of Chakma zero - for (int i = 0; i < 10; ++i) { - QChar data[] = { - QChar::highSurrogate(zeroVal + i), QChar::lowSurrogate(zeroVal + i), - }; - QTest::addRow("%s:Chakma-digit-%d", code.name, i) << QStringView(data) << code.code; - } + using Digits = std::array<QChar, 2>; + using DigitsArray = std::array<Digits, 10>; + static constexpr DigitsArray chakmaDigits = []() { + const char32_t zeroVal = 0x11136; // Unicode's representation of Chakma zero + DigitsArray r; + for (int i = 0; i < int(r.size()); ++i) + r[i] = { QChar::highSurrogate(zeroVal + i), QChar::lowSurrogate(zeroVal + i) }; + return r; + }(); + for (int i = 0; i < int(chakmaDigits.size()); ++i) + QTest::addRow("%s:Chakma-digit-%d", code.name, i) << QStringView(chakmaDigits[i]) << code.code; } } } @@ -407,6 +489,281 @@ void tst_QStringConverter::roundtrip() QCOMPARE(decoded, uniString); } +void tst_QStringConverter::convertL1U8() +{ + { + std::array<char, 256> latin1; + std::iota(latin1.data(), latin1.data() + latin1.size(), uchar(0)); + std::array<char, 512> utf8; + auto out = QUtf8::convertFromLatin1(utf8.data(), QLatin1StringView{latin1.data(), latin1.size()}); + QCOMPARE(QString::fromLatin1(latin1.data(), latin1.size()), + QString::fromUtf8(utf8.data(), out - utf8.data())); + } +} + +#if QT_CONFIG(icu) + +void tst_QStringConverter::roundtripIcu_data() +{ + QTest::addColumn<QString>("original"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("shift_jis") << u"古池や 蛙飛び込む 水の音"_s << QByteArray("shift_jis"); + QTest::addRow("UTF7") << u"Übermäßig: čçö"_s << QByteArray("UTF-7"); +} + +void tst_QStringConverter::roundtripIcu() +{ + QFETCH(QString, original); + QFETCH(QByteArray, codec); + QStringEncoder fromUtf16(codec); + if (!fromUtf16.isValid()) + QSKIP("Unsupported codec"); + QStringDecoder toUtf16(codec); + QByteArray asShiftJIS = fromUtf16(original); + QString roundTripped = toUtf16(asShiftJIS); + QCOMPARE(roundTripped, original); +} + +void tst_QStringConverter::icuEncodeEdgeCases_data() +{ + QTest::addColumn<QString>("source"); + QTest::addColumn<QByteArray>("expected") ; + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("empty") << QString() << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("BOMonly") << QString(QChar(QChar::ByteOrderMark)) << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("1to6") << u"좋"_s << QByteArray::fromHex("1b2428434141") << QByteArray("ISO-2022-JP-2"); + QTest::addRow("1to7") << u"漢"_s << QByteArray::fromHex("1b2429470e6947") << QByteArray("ISO-2022-CN"); + QTest::addRow("1to8") << u"墎"_s << QByteArray::fromHex("1b242a481b4e4949") << QByteArray("ISO-2022-CN"); + QTest::addRow("utf7") << u"Übergröße"_s << QByteArray("+ANw-bergr+APYA3w-e") << QByteArray("UTF-7"); +} + +void tst_QStringConverter::icuEncodeEdgeCases() +{ + QFETCH(QString, source); + QFETCH(QByteArray, expected); + QFETCH(QByteArray, codec); + QStringEncoder encoder(codec); + if (!encoder.isValid()) + QSKIP("Unsupported codec"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(source); + QCOMPARE(encoded, expected); +} + +void tst_QStringConverter::charByCharConsistency_data() +{ + QTest::addColumn<QStringView>("source"); + QTest::addColumn<QByteArray>("codec"); + + auto addRow = [](const TestString &s) { + QTest::addRow("%s_shift_jis", s.description) << s.utf16 << QByteArray("shift_jis"); + QTest::addRow("%s_EUC-CN", s.description) << s.utf16 << QByteArray("EUC-CN"); + }; + + for (const TestString &s : testStrings) { + if (s.utf16.isEmpty()) + continue; + addRow(s); + } +} + +void tst_QStringConverter::charByCharConsistency() +{ + QFETCH(const QStringView, source); + QFETCH(const QByteArray, codec); + + const auto check = [&](QStringEncoder encoder){ + if (!encoder.isValid()) + QSKIP("Unsupported codec"); + + QByteArray fullyConverted = encoder.encode(source); + encoder.resetState(); + QByteArray stepByStepConverted; + for (const auto& codeUnit: source) { + stepByStepConverted += encoder.encode(codeUnit); + } + QCOMPARE(stepByStepConverted, fullyConverted); + }; + + check(QStringEncoder(codec)); + if (QTest::currentTestResolved()) return; + + check(QStringEncoder(codec, QStringConverter::Flag::ConvertInvalidToNull)); + if (QTest::currentTestResolved()) return; + + // moved codecs also work: + + { + QStringEncoder dec(codec); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + + { + QStringEncoder dec(codec, QStringConverter::Flag::ConvertInvalidToNull); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + +} + +void tst_QStringConverter::byteByByteConsistency_data() +{ + QTest::addColumn<QByteArray>("source"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("plain_ascii_utf7") << QByteArray("Hello, world!") << QByteArray("UTF-7"); + QFile eucKr(":/euc_kr.txt"); + if (eucKr.open(QFile::ReadOnly)) + QTest::addRow("euc_kr_storing_jp") << eucKr.readAll() << QByteArray("EUC-KR"); + QTest::addRow("incomplete_euc_jp") << QByteArrayLiteral("test\x8Ftest") << QByteArray("EUC-JP"); +} + +void tst_QStringConverter::byteByByteConsistency() +{ + QFETCH(const QByteArray, source); + QFETCH(const QByteArray, codec); + + const auto check = [&](QStringDecoder decoder) { + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + + QString fullyConverted = decoder.decode(source); + decoder.resetState(); + QString stepByStepConverted; + for (const auto& byte: source) { + QByteArray singleChar; + singleChar.append(byte); + stepByStepConverted += decoder.decode(singleChar); + } + QCOMPARE(stepByStepConverted, fullyConverted); + }; + + check(QStringDecoder(codec)); + if (QTest::currentTestResolved()) return; + + check(QStringDecoder(codec, QStringConverter::Flag::ConvertInvalidToNull)); + if (QTest::currentTestResolved()) return; + + // moved codecs also work: + + { + QStringDecoder dec(codec); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + + { + QStringDecoder dec(codec, QStringConverter::Flag::ConvertInvalidToNull); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + +} + +void tst_QStringConverter::statefulPieceWise() +{ + QStringDecoder decoder("HZ"); + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + QString start = decoder.decode("pure ASCII"); + QCOMPARE(start, u"pure ASCII"); + QString shifted = decoder.decode("~{"); + // shift out changes the state, but won't create any output + QCOMPARE(shifted, ""); + QString continuation = decoder.decode("\x42\x43"); + QCOMPARE(continuation, "旅"); + decoder.resetState(); + // after resetting the state we're in N0 again + QString afterReset = decoder.decode("\x42\x43"); + QCOMPARE(afterReset, "BC"); +} + +void tst_QStringConverter::icuUsableAfterMove() +{ + { + QStringDecoder decoder("EUC-JP"); + QVERIFY(decoder.isValid()); + QString partial = decoder.decode("Test\x8E"); + QCOMPARE(partial, u"Test"_s); + QStringDecoder moved(std::move(decoder)); + QString complete = partial + moved.decode("\xA1Test"); + QCOMPARE(complete, u"Test\uFF61Test"_s); + } + { + QStringEncoder encoder("Big5"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode("hello"_L1); + QCOMPARE(encoded, "hello"); + QStringEncoder moved(std::move(encoder)); + encoded = moved.encode("bye"); + QCOMPARE(encoded, "bye"); + } +} + +void tst_QStringConverter::icuInvalidCharacter_data() +{ + QTest::addColumn<QString>("string"); + QTest::addColumn<QByteArray>("bytearray"); + QTest::addColumn<QByteArray>("codec"); + QTest::addColumn<QStringConverter::Flags>("flags"); + QTest::addColumn<bool>("shouldDecode"); + + using Flags = QStringConverter::Flags; + using Flag = QStringConverter::Flag; + QTest::addRow("encode") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\x3F\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::Default) + << false; + QTest::addRow("encode_null") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\0\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::ConvertInvalidToNull) + << false; + QTest::addRow("decode_incomplete_EUC-JP") + << u"test"_s + << QByteArrayLiteral("test\x8F") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << true; + QTest::addRow("decode_invalid_EUC-JP_sequence") + << u"test\0test"_s + << QByteArrayLiteral("test\x8Ftest") + << QByteArray("EUC-JP") << Flags(Flag::ConvertInvalidToNull) + << true; + QTest::addRow("encode_incomplete_surrogate") + << u"test"_s + QChar::highSurrogate(0x11136) + << QByteArray("test") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << false; +} + +void tst_QStringConverter::icuInvalidCharacter() +{ + QFETCH(QString, string); + QFETCH(QByteArray, bytearray); + QFETCH(QByteArray, codec); + QFETCH(QStringConverter::Flags, flags); + QFETCH(bool, shouldDecode); + if (shouldDecode) { + QStringDecoder decoder(codec.data(), flags); + QVERIFY(decoder.isValid()); + QString decoded = decoder.decode(bytearray); + QVERIFY(decoder.hasError()); + QCOMPARE(decoded, string); + } else { + QStringEncoder encoder(codec.data(), flags); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(string); + QVERIFY(encoder.hasError()); + QCOMPARE(encoded, bytearray); + } +} + +#endif + void tst_QStringConverter::flagF7808080() const { /* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml, @@ -1535,7 +1892,7 @@ void tst_QStringConverter::roundtripBom_data() QTest::addColumn<QStringView>("utf16"); QTest::addColumn<QStringConverter::Encoding>("code"); - for (const auto code : codes) { + for (const auto &code : codes) { if (size_t(code.code) >= encodedBoms.size()) break; if (code.limitation != FullUnicode) @@ -1844,7 +2201,7 @@ void tst_QStringConverter::utfHeaders() QVERIFY(decode.isValid()); QString result = decode(encoded); - QCOMPARE(result.length(), unicode.length()); + QCOMPARE(result.size(), unicode.size()); QCOMPARE(result, unicode); } @@ -1855,7 +2212,7 @@ void tst_QStringConverter::utfHeaders() QString result; for (char c : encoded) result += decode(QByteArrayView(&c, 1)); - QCOMPARE(result.length(), unicode.length()); + QCOMPARE(result.size(), unicode.size()); QCOMPARE(result, unicode); } @@ -1883,25 +2240,42 @@ void tst_QStringConverter::encodingForName_data() QTest::addColumn<QByteArray>("name"); QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); - QTest::newRow("UTF-8") << QByteArray("UTF-8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); - QTest::newRow("utf8") << QByteArray("utf8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); - QTest::newRow("Utf-8") << QByteArray("Utf-8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); - QTest::newRow("UTF-16") << QByteArray("UTF-16") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16); - QTest::newRow("UTF-16le") << QByteArray("UTF-16le") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE); - QTest::newRow("ISO-8859-1") << QByteArray("ISO-8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("ISO8859-1") << QByteArray("ISO8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("iso8859-1") << QByteArray("iso8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("latin1") << QByteArray("latin1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("latin2") << QByteArray("latin2") << std::optional<QStringConverter::Encoding>(); - QTest::newRow("latin15") << QByteArray("latin15") << std::optional<QStringConverter::Encoding>(); + auto row = [](const char *name, std::optional<QStringConverter::Encoding> expected = std::nullopt) { + auto protect = [](auto p) { return p ? *p ? p : "<empty>" : "<nullptr>"; }; + QTest::addRow("%s", protect(name)) << QByteArray(name) << expected; + }; + + row("UTF-8", QStringConverter::Utf8); + row("utf8", QStringConverter::Utf8); + row("Utf-8", QStringConverter::Utf8); + row("UTF-16", QStringConverter::Utf16); + row("UTF-16le", QStringConverter::Utf16LE); + row("ISO-8859-1", QStringConverter::Latin1); + row("ISO8859-1", QStringConverter::Latin1); + row("iso8859-1", QStringConverter::Latin1); + row("latin1", QStringConverter::Latin1); + row("latin-1_-", QStringConverter::Latin1); + row("latin_1-_", QStringConverter::Latin1); + row("-_latin-1", QStringConverter::Latin1); + row("_-latin_1", QStringConverter::Latin1); + + // failures: + row(nullptr); + row(""); + row("latin2"); + row("latin42"); + row(" latin1"); // spaces are significant + row("\tlatin1"); // HTs are significant } void tst_QStringConverter::encodingForName() { - QFETCH(QByteArray, name); - QFETCH(std::optional<QStringConverter::Encoding>, encoding); + QFETCH(const QByteArray, name); + QFETCH(const std::optional<QStringConverter::Encoding>, encoding); - auto e = QStringConverter::encodingForName(name); + const auto *ptr = name.isNull() ? nullptr : name.data(); + + const auto e = QStringConverter::encodingForName(ptr); QCOMPARE(e, encoding); } @@ -1985,65 +2359,102 @@ void tst_QStringConverter::encodingForHtml_data() { QTest::addColumn<QByteArray>("html"); QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); + QTest::addColumn<QByteArray>("name"); // ICU name if we have ICU support QByteArray html = "<html><head></head><body>blah</body></html>"; - QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>"; - QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-8859-15"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=SJIS\" /></head></html>"; + QTest::newRow("sjis") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Shift_JIS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022-JP\" /></head></html>"; + QTest::newRow("ISO-2022-JP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022\" /></head></html>"; + QTest::newRow("ISO-2022") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312\" /></head></html>"; + QTest::newRow("GB2312") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5\" /></head></html>"; + QTest::newRow("Big5") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB18030\" /></head></html>"; + QTest::newRow("GB18030") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB18030"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312-HKSCS\" /></head></html>"; + QTest::newRow("GB2312-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312-HKSCS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5-HKSCS\" /></head></html>"; + QTest::newRow("Big5-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5-HKSCS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucJP\" /></head></html>"; + QTest::newRow("EucJP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucKR\" /></head></html>"; + QTest::newRow("EucKR") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-KR"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-R\" /></head></html>"; + QTest::newRow("KOI8-R") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-R"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-U\" /></head></html>"; + QTest::newRow("KOI8-U") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-U"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>"; - QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); + QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>"; - QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); + QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>"; - QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>"; - QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>"; - QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>"; - QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>"; - QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>"; - QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); // Test invalid charsets. html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>"; - QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>"; - QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>() << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\""; html.prepend(QByteArray().fill(' ', 512 - html.size())); - QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8"; - QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8"; - QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>"; - QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>"; - QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 }; html = src; - QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE); + QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE) << QByteArray("UTF-16LE"); html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: " @@ -2051,19 +2462,33 @@ void tst_QStringConverter::encodingForHtml_data() "line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: " "auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: " "none;\">ͻ</span>\000"; - QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">" "<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16. - QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); } void tst_QStringConverter::encodingForHtml() { QFETCH(QByteArray, html); QFETCH(std::optional<QStringConverter::Encoding>, encoding); + QFETCH(QByteArray, name); QCOMPARE(QStringConverter::encodingForHtml(html), encoding); + + QStringDecoder decoder = QStringDecoder::decoderForHtml(html); + if (encoding || // we should have a valid decoder independent of ICU support + decoder.isValid()) { // we got a valid decoder through ICU + QCOMPARE(decoder.name(), name); + } +} + +void tst_QStringConverter::availableCodesAreAvailable() +{ + auto codecs = QStringConverter::availableCodecs(); + for (const auto &codecName: codecs) + QVERIFY(QStringEncoder(codecName.toLatin1()).isValid()); } class LoadAndConvert: public QRunnable @@ -2096,6 +2521,10 @@ void tst_QStringConverter::initTestCase() void tst_QStringConverter::threadSafety() { +#if defined(Q_OS_WASM) + QSKIP("This test misbehaves on WASM. Investigation needed (QTBUG-110067)"); +#endif + QThreadPool::globalInstance()->setMaxThreadCount(12); QList<QString> res; @@ -2111,6 +2540,292 @@ void tst_QStringConverter::threadSafety() QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz")); } +#ifdef Q_OS_WIN +void tst_QStringConverter::fromLocal8Bit_data() +{ + QTest::addColumn<QByteArray>("eightBit"); + QTest::addColumn<QString>("utf16"); + QTest::addColumn<quint32>("codePage"); + + constexpr uint WINDOWS_1252 = 1252u; + QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252; + constexpr uint SHIFT_JIS = 932u; + // Mostly two byte characters, but the comma is a single byte character (0xa4) + QTest::newRow("shiftJIS") + << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba + << u"こんにちは、世界!"_s << SHIFT_JIS; + + constexpr uint GB_18030 = 54936u; + QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界!"_s + << GB_18030; +} + +void tst_QStringConverter::fromLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (char c : eightBit) + result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + // Decode the full string again, this time without state + state.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_special_cases() +{ + QStringConverter::State state; + constexpr uint SHIFT_JIS = 932u; + // Decode a 2-octet character, but only provide 1 octet at first: + QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second octet: + result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state); + QCOMPARE(result, u"こ"); + QCOMPARE(state.remainingChars, 0); + + // And without state: + result.clear(); + QStringConverter::State statelessState; + statelessState.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &statelessState); + result += QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &statelessState); + // 0xb1 is a valid single-octet character in Shift-JIS, so the output + // isn't really what you would expect: + QCOMPARE(result, QString(QChar::ReplacementCharacter) + u'ア'); + QCOMPARE(statelessState.remainingChars, 0); + + // Now try a 3-octet UTF-8 sequence: + result.clear(); + state.clear(); + constexpr uint UTF8 = 65001u; + // First the first 2 octets: + result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the remaining octet: + result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QCOMPARE(result, u"你"); + QCOMPARE(state.remainingChars, 0); + + // Now the same, but there is an incomplete sequence at the start + result.clear(); + state.clear(); + result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + // Remaining octet (and a '.' to force it to discard something from the + // internal state which is currently limited to 4 octets): + result += QLocal8Bit::convertToUnicode_sys("\xa0.", UTF8, &state); + QCOMPARE(result, QChar::ReplacementCharacter + u"你."_s); + QCOMPARE(state.remainingChars, 0); + + // Test QTBUG-118834, which is failing + result.clear(); + state.clear(); + result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + // Remaining octet: + result += QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QEXPECT_FAIL("", "QTBUG-118834: We don't output anything because it's " + "within the size of our internal state, and we cannot " + "signal that it needs to be drained.", Continue); + QCOMPARE(result, QChar::ReplacementCharacter + u"你"_s); + QEXPECT_FAIL("", "QTBUG-118834: As above", Continue); + QCOMPARE(state.remainingChars, 0); + + // Now try a 4-octet GB 18030 sequence: + result.clear(); + state.clear(); + constexpr uint GB_18030 = 54936u; + const char sequence[] = "\x95\x32\x90\x31"; + // Repeat the sequence multiple times to test handling of exhaustion of + // internal buffer + QByteArray repeated = QByteArray(sequence).repeated(2049); + QByteArrayView octets = QByteArrayView(repeated); + result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide one more octet: + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the last octet + the rest of the string + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(3), GB_18030, &state); + QCOMPARE(result.first(2), u"𠂇"); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3; + QByteArray input; + QT_TRY { + input.reserve(size); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // fill with '、' - a single octet character in Shift-JIS + input.fill('\xa4', std::numeric_limits<int>::max() - 1); + // then append 'こ' - a two octet character in Shift-JIS + // which is now straddling the 2 GiB boundary + input += "\x82\xb1"; + // then append another two '、', so that our output is also crossing the + // 2 GiB boundary + input += "\xa4\xa4"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint SHIFT_JIS = 932u; + QStringConverter::State state; + QString result; + QT_TRY { + result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16 + QCOMPARE(result.last(4), u"、こ、、"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} + +void tst_QStringConverter::toLocal8Bit_data() +{ + fromLocal8Bit_data(); +} + +void tst_QStringConverter::toLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (QChar c : utf16) + result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + // Decode the full string again, this time without state + state.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_special_cases() +{ + QStringConverter::State state; + // Normally utf8 goes through a different code path, but we can force it here + constexpr uint UTF8 = 65001u; + // Decode a 2-code unit character, but only provide 1 code unit at first: + const char16_t a[] = u"𬽦"; + QStringView codeUnits = a; + QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second code unit: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); + + // Retain compat with the behavior for toLocal8Bit: + QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?"); + + // QString::toLocal8Bit is already stateless, but test stateless handling + // explicitly anyway: + result.clear(); + QStringConverter::State statelessState; + statelessState.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &statelessState); + result += QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &statelessState); + // Windows uses the replacement character for invalid characters: + QCOMPARE(result, "\ufffd\ufffd"); + + // Now do the same, but the second time we feed in a character, we also + // provide many more so the internal stack buffer is not large enough. + result.clear(); + state.clear(); + QString str = QStringView(a).toString().repeated(2048); + codeUnits = str; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then we provide the rest of the string: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + constexpr qsizetype TwoGiB = qsizetype(std::numeric_limits<int>::max()); + QString input; + QT_TRY { + input.reserve(TwoGiB + 1); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // Fill with a single code unit character + input.fill(u'.', TwoGiB - 1); + // Then append a 2 code unit character, so that the input straddles the 2 GiB + // boundary + input += u"🙂"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint UTF8 = 65001u; + QStringConverter::State state; + QByteArray result; + QT_TRY { + result = QLocal8Bit::convertFromUnicode_sys(input, UTF8, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QUtf8StringView rView = result; + QCOMPARE(rView.size(), TwoGiB + 3); // The 2 code unit smiley is 4 code units in UTF-8 + QCOMPARE(rView.last(7), u8"...🙂"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} +#endif // Q_OS_WIN + struct DontCrashAtExit { ~DontCrashAtExit() { QStringDecoder decoder(QStringDecoder::Utf8); |