diff options
Diffstat (limited to 'tests/auto/corelib/text/qstringconverter')
5 files changed, 1145 insertions, 159 deletions
diff --git a/tests/auto/corelib/text/qstringconverter/BLACKLIST b/tests/auto/corelib/text/qstringconverter/BLACKLIST deleted file mode 100644 index 0bf1982277..0000000000 --- a/tests/auto/corelib/text/qstringconverter/BLACKLIST +++ /dev/null @@ -1,3 +0,0 @@ -# QTBUG-87418 -[convertUtf8] -android diff --git a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt index 5b5eee953e..22378fe96b 100644 --- a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt +++ b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt @@ -1,10 +1,28 @@ -# Generated from qstringconverter.pro. +# Copyright (C) 2022 The Qt Company Ltd. +# SPDX-License-Identifier: BSD-3-Clause ##################################################################### ## tst_qstringconverter Test: ##################################################################### +if(NOT QT_BUILD_STANDALONE_TESTS AND NOT QT_BUILDING_QT) + cmake_minimum_required(VERSION 3.16) + project(tst_qstringconverter LANGUAGES CXX) + find_package(Qt6BuildInternals REQUIRED COMPONENTS STANDALONE_TEST) +endif() + qt_internal_add_test(tst_qstringconverter SOURCES tst_qstringconverter.cpp + TESTDATA ${test_data} + LIBRARIES + Qt::CorePrivate # for access to Qt's feature system +) + + +qt_internal_add_resource(tst_qstringconverter "compressedtexture_bc1" + PREFIX + "/" + FILES + "euc_kr.txt" ) diff --git a/tests/auto/corelib/text/qstringconverter/euc_kr.txt b/tests/auto/corelib/text/qstringconverter/euc_kr.txt new file mode 100644 index 0000000000..a0eb9af691 --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/euc_kr.txt @@ -0,0 +1 @@ +Ы?Ȫʦ??ɪǪEUC Packed Formatȡ2ЫͳEUC Fixed Width Format롣ġ?ݻ?ĪǡEUCȪǪ˪Ī?롣 diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 38714ef580..342c343a42 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -1,58 +1,180 @@ -/**************************************************************************** -** -** Copyright (C) 2020 The Qt Company Ltd. -** Copyright (C) 2016 Intel Corporation. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the test suite of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:GPL-EXCEPT$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 3 as published by the Free Software -** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ +// Copyright (C) 2021 The Qt Company Ltd. +// Copyright (C) 2016 Intel Corporation. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only #include <QTest> +#include <QtCore/private/qglobal_p.h> #include <qstringconverter.h> +#include <private/qstringconverter_p.h> #include <qthreadpool.h> +#include <array> +#include <numeric> + +using namespace Qt::StringLiterals; + +QT_BEGIN_NAMESPACE +namespace QTest { +template <typename T> +char *toString(const std::optional<T> &opt) +{ + if (opt) + return QTest::toString(*opt); + else + return qstrdup("std::nullopt"); +} +} // namespace QTest +QT_END_NAMESPACE + +using QTest::toString; + +static constexpr bool IsBigEndian = QSysInfo::ByteOrder == QSysInfo::BigEndian; +enum CodecLimitation { + AsciiOnly, + Latin1Only, + FullUnicode +}; + +#ifdef Q_OS_WIN +# include <qt_windows.h> +static bool localeIsUtf8() +{ + return GetACP() == CP_UTF8; +} +#else +static constexpr bool localeIsUtf8() +{ + return true; +} +#endif + +struct Codec +{ + const char name[12]; + QStringConverter::Encoding code; + CodecLimitation limitation = FullUnicode; +}; +static const std::array codes = { + Codec{ "UTF-8", QStringConverter::Utf8 }, + Codec{ "UTF-16", QStringConverter::Utf16 }, + Codec{ "UTF-16-le", QStringConverter::Utf16LE }, + Codec{ "UTF-16-be", QStringConverter::Utf16BE }, + Codec{ "UTF-32", QStringConverter::Utf32 }, + Codec{ "UTF-32-le", QStringConverter::Utf32LE }, + Codec{ "UTF-32-be", QStringConverter::Utf32BE }, + Codec{ "Latin-1", QStringConverter::Latin1, Latin1Only }, + Codec{ "System", QStringConverter::System, localeIsUtf8() ? FullUnicode : AsciiOnly } +}; + +static const std::array encodedBoms = { + QByteArrayView("\xef\xbb\xbf"), // Utf8, + QByteArrayView(IsBigEndian ? "\xfe\xff" : "\xff\xfe"), // Utf16, + QByteArrayView("\xff\xfe"), // Utf16LE, + QByteArrayView("\xfe\xff"), // Utf16BE, + QByteArrayView(IsBigEndian ? "\0\0\xfe\xff" : "\xff\xfe\0", 4), // Utf32, + QByteArrayView("\xff\xfe\0", 4), // Utf32LE, + QByteArrayView("\0\0\xfe\xff", 4), // Utf32BE, +}; + +struct TestString +{ + const char *description; + QUtf8StringView utf8; + QStringView utf16; + CodecLimitation limitation = FullUnicode; +}; +static const std::array testStrings = { + TestString{ "empty", "", u"", AsciiOnly }, + TestString{ "null-character", QUtf8StringView("", 1), QStringView(u"", 1), AsciiOnly }, + TestString{ "ascii-text", + "This is a standard US-ASCII message", + "This is a standard US-ASCII message" u"", + AsciiOnly + }, + TestString{ "ascii-with-carriage-return", "a\rb", u"a\rb", AsciiOnly }, + TestString{ "ascii-with-control", + "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars", + "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars" u"", + AsciiOnly + }, + + TestString{ "nbsp", "\u00a0", u"\u00a0", Latin1Only }, + TestString{ "latin1-text", + "Hyvää päivää, käyhän että tuon kannettavani saunaan?", + "Hyvää päivää, käyhän että tuon kannettavani saunaan?" u"", + Latin1Only + }, + +#define ROW(name, string) TestString{ name, u8"" string, u"" string } + ROW("euro", "€"), + ROW("character+bom", "b\ufeff"), + /* Check that the codec does NOT flag EFBFBF. + * This is a regression test; see QTBUG-33229 + */ + ROW("last-bmp", "\uffff"), + ROW("character+last-bmp", "b\uffff"), + ROW("replacement", "\ufffd"), + ROW("supplementary-plane", "\U00010203"), + ROW("mahjong", "\U0001f000\U0001f001\U0001f002\U0001f003\U0001f004\U0001f005" + "\U0001f006\U0001f007\U0001f008\U0001f009\U0001f00a\U0001f00b\U0001f00c" + "\U0001f00d\U0001f00e\U0001f00f"), + ROW("emojis", "😂, 😃, 🧘🏻♂️, 🌍, 🌦️, 🍞, 🚗, 📞, 🎉, ❤️, 🏁"), // https://en.wikipedia.org/wiki/Emoji + ROW("last-valid", "\U0010fffd"), // U+10FFFF is the strict last, but it's a non-character + ROW("mixed-bmp-only", "abc\u00a0\u00e1\u00e9\u01fd \u20acdef"), + ROW("mixed-full", "abc\u00a0\u00e1\u00e9\u01fd \U0010FFFD \u20acdef"), + ROW("xml", "<doc>\U00010000\U0010FFFD</doc>\r\n") +#undef ROW +}; + class tst_QStringConverter : public QObject { Q_OBJECT private slots: + void initTestCase(); + void threadSafety(); void constructByName(); + void invalidConverter(); + + void convertUtf8_data(); void convertUtf8(); + void convertUtf8CharByChar_data() { convertUtf8_data(); } + void convertUtf8CharByChar(); + void roundtrip_data(); + void roundtrip(); + + void convertL1U8(); + + void convertL1U16(); + +#if QT_CONFIG(icu) + void roundtripIcu_data(); + void roundtripIcu(); + void icuInvalidCharacter_data(); + void icuInvalidCharacter(); + void icuEncodeEdgeCases_data(); + void icuEncodeEdgeCases(); + void icuUsableAfterMove(); + void charByCharConsistency_data(); + void charByCharConsistency(); + void byteByByteConsistency_data(); + void byteByByteConsistency(); + void statefulPieceWise(); +#endif - void nonFlaggedCodepointFFFF() const; void flagF7808080() const; - void nonFlaggedEFBFBF() const; - void decode0D() const; void utf8Codec_data(); void utf8Codec(); void utf8bom_data(); void utf8bom(); + void roundtripBom_data(); + void roundtripBom(); void utf8stateful_data(); void utf8stateful(); @@ -71,6 +193,20 @@ private slots: void encodingForHtml_data(); void encodingForHtml(); + + void availableCodesAreAvailable(); + +#ifdef Q_OS_WIN + // On all other systems local 8-bit encoding is UTF-8 + void fromLocal8Bit_data(); + void fromLocal8Bit(); + void fromLocal8Bit_special_cases(); + void fromLocal8Bit_2GiB(); + void toLocal8Bit_data(); + void toLocal8Bit(); + void toLocal8Bit_special_cases(); + void toLocal8Bit_2GiB(); +#endif }; void tst_QStringConverter::constructByName() @@ -98,73 +234,536 @@ void tst_QStringConverter::constructByName() QVERIFY(!strcmp(decoder.name(), "UTF-16")); } +void tst_QStringConverter::invalidConverter() +{ + // QStringEncoder tests + { + QStringEncoder encoder; + QVERIFY(!encoder.isValid()); + QVERIFY(!encoder.name()); + QByteArray encoded = encoder(u"Some text"); + QVERIFY(encoded.isEmpty()); + QVERIFY(encoder.hasError()); + + encoder.resetState(); + QVERIFY(!encoder.hasError()); + + encoded = encoder.encode(u"More text"); + QVERIFY(encoded.isEmpty()); + QVERIFY(encoder.hasError()); + QCOMPARE(encoder.requiredSpace(42), 0); + + encoder.resetState(); + QVERIFY(!encoder.hasError()); + char buffer[100]; + char *position = encoder.appendToBuffer(buffer, u"Even more"); + QCOMPARE(position - buffer, 0); + QVERIFY(encoder.hasError()); + } + + // QStringDecoder tests + { + QStringDecoder decoder; + QVERIFY(!decoder.name()); + QVERIFY(!decoder.isValid()); + QString decoded = decoder("Some text"); + QVERIFY(decoded.isEmpty()); + QVERIFY(decoder.hasError()); + + decoder.resetState(); + QVERIFY(!decoder.hasError()); + + decoded = decoder.decode("More text"); + QVERIFY(decoded.isEmpty()); + QVERIFY(decoder.hasError()); + + QCOMPARE(decoder.requiredSpace(42), 0); + + decoder.resetState(); + QVERIFY(!decoder.hasError()); + char16_t buffer[100]; + char16_t *position = decoder.appendToBuffer(buffer, "Even more"); + QCOMPARE(position - buffer, 0); + QVERIFY(decoder.hasError()); + } +} + +void tst_QStringConverter::convertUtf8_data() +{ + QTest::addColumn<QStringConverter::Encoding>("encoding"); + QTest::addColumn<QUtf8StringView>("utf8"); + QTest::addColumn<QStringView>("utf16"); + auto addRow = [](const TestString &s) { + QTest::addRow("Utf8:%s", s.description) << QStringDecoder::Utf8 << s.utf8 << s.utf16; + if (localeIsUtf8()) + QTest::addRow("System:%s", s.description) << QStringDecoder::System << s.utf8 << s.utf16; + }; + + for (const TestString &s : testStrings) + addRow(s); +} + void tst_QStringConverter::convertUtf8() { - QFile file(QFINDTESTDATA("utf8.txt")); + QFETCH(QStringConverter::Encoding, encoding); + QFETCH(QUtf8StringView, utf8); + QFETCH(QStringView, utf16); + + QByteArray ba = QByteArray::fromRawData(utf8.data(), utf8.size()); + + QStringDecoder decoder(encoding); + QVERIFY(decoder.isValid()); + QString uniString = decoder(ba); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + QCOMPARE(ba, uniString.toUtf8()); + + // do it again (using .decode()) + uniString = decoder.decode(ba); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + QCOMPARE(ba, uniString.toUtf8()); + + QStringEncoder encoder(encoding); + QByteArray reencoded = encoder(utf16); + QCOMPARE(reencoded, utf8); + QCOMPARE(reencoded, uniString.toUtf8()); + + // do it again (using .encode()) + reencoded = encoder.encode(utf16); + QCOMPARE(reencoded, utf8); + QCOMPARE(reencoded, uniString.toUtf8()); + + if (utf16.isEmpty()) + return; + + // repeat, with a longer string + constexpr qsizetype MinSize = 128; + uniString = utf16.toString(); + while (uniString.size() < MinSize && ba.size() < MinSize) { + uniString += uniString; + ba += ba; + } + QCOMPARE(decoder(ba), uniString); + QCOMPARE(encoder(uniString), ba); +} + +void tst_QStringConverter::convertUtf8CharByChar() +{ + QFETCH(QStringConverter::Encoding, encoding); + QFETCH(QUtf8StringView, utf8); + QFETCH(QStringView, utf16); - if (!file.open(QIODevice::ReadOnly)) - QFAIL(qPrintable("File could not be opened: " + file.errorString())); + QByteArray ba = QByteArray::fromRawData(utf8.data(), utf8.size()); - QByteArray ba = file.readAll(); - QVERIFY(!ba.isEmpty()); + QStringDecoder decoder(encoding); + QVERIFY(decoder.isValid()); + QString uniString; + for (int i = 0; i < ba.size(); ++i) + uniString += decoder(QByteArrayView(ba).sliced(i, 1)); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + uniString.clear(); + + // do it again (using .decode()) + for (int i = 0; i < ba.size(); ++i) + uniString += decoder.decode(QByteArrayView(ba).sliced(i, 1)); + QCOMPARE(uniString, utf16); + QCOMPARE(uniString, QString::fromUtf8(ba)); + + QStringEncoder encoder(encoding); + QByteArray reencoded; + for (int i = 0; i < utf16.size(); ++i) + reencoded += encoder(utf16.sliced(i, 1)); + QCOMPARE(reencoded, ba); + reencoded.clear(); + + // do it again (using .encode()) + for (int i = 0; i < utf16.size(); ++i) + reencoded += encoder.encode(utf16.sliced(i, 1)); + QCOMPARE(reencoded, ba); +} + +void tst_QStringConverter::convertL1U16() +{ + const QLatin1StringView latin1("some plain latin1 text"); + const QString qstr(latin1); + + QStringDecoder decoder(QStringConverter::Latin1); + QVERIFY(decoder.isValid()); + QString uniString = decoder(latin1); + QCOMPARE(uniString, qstr); + QCOMPARE(latin1, uniString.toLatin1()); + + // do it again (using .decode()) + uniString = decoder.decode(latin1); + QCOMPARE(uniString, qstr); + QCOMPARE(latin1, uniString.toLatin1()); + + QStringEncoder encoder(QStringConverter::Latin1); + QByteArray reencoded = encoder(uniString); + QCOMPARE(reencoded, QByteArrayView(latin1)); + QCOMPARE(reencoded, uniString.toLatin1()); + + // do it again (using .encode()) + reencoded = encoder.encode(uniString); + QCOMPARE(reencoded, QByteArrayView(latin1)); + QCOMPARE(reencoded, uniString.toLatin1()); +} +void tst_QStringConverter::roundtrip_data() +{ + QTest::addColumn<QStringView>("utf16"); + QTest::addColumn<QStringConverter::Encoding>("code"); + + for (const auto &code : codes) { + for (const TestString &s : testStrings) { + // rules: + // 1) don't pass the null character to the System codec + // 2) only pass operate on a string that will properly convert + if (code.code == QStringConverter::System && s.utf16.contains(QChar(0))) + continue; + if (code.limitation < s.limitation) + continue; + QTest::addRow("%s:%s", code.name, s.description) << s.utf16 << code.code; + } + + if (code.limitation == FullUnicode) { + using Digits = std::array<QChar, 2>; + using DigitsArray = std::array<Digits, 10>; + static constexpr DigitsArray chakmaDigits = []() { + const char32_t zeroVal = 0x11136; // Unicode's representation of Chakma zero + DigitsArray r; + for (int i = 0; i < int(r.size()); ++i) + r[i] = { QChar::highSurrogate(zeroVal + i), QChar::lowSurrogate(zeroVal + i) }; + return r; + }(); + for (int i = 0; i < int(chakmaDigits.size()); ++i) + QTest::addRow("%s:Chakma-digit-%d", code.name, i) << QStringView(chakmaDigits[i]) << code.code; + } + } +} + +void tst_QStringConverter::roundtrip() +{ + QFETCH(QStringView, utf16); + QFETCH(QStringConverter::Encoding, code); + QStringEncoder out(code); + QByteArray encoded = out.encode(utf16); + QStringDecoder back(code); + QString decoded = back.decode(encoded); + QCOMPARE(decoded, utf16); + + // test some flags + QStringConverter::Flags flag = QStringEncoder::Flag::Stateless; { - QStringDecoder decoder(QStringDecoder::Utf8); - QVERIFY(decoder.isValid()); - QString uniString = decoder(ba); - QCOMPARE(uniString, QString::fromUtf8(ba)); - QCOMPARE(ba, uniString.toUtf8()); - uniString = decoder.decode(ba); - QCOMPARE(uniString, QString::fromUtf8(ba)); - QCOMPARE(ba, uniString.toUtf8()); - - QStringEncoder encoder(QStringEncoder::Utf8); - QCOMPARE(ba, encoder(uniString)); - QCOMPARE(ba, encoder.encode(uniString)); + QStringEncoder out2(code, flag); + QStringDecoder back2(code, flag); + decoded = back2.decode(out2.encode(utf16)); + QCOMPARE(decoded, utf16); + } + flag |= QStringConverter::Flag::ConvertInvalidToNull; + { + QStringEncoder out2(code, flag); + QStringDecoder back2(code, flag); + decoded = back2.decode(out2.encode(utf16)); + QCOMPARE(decoded, utf16); + } + + if (utf16.isEmpty()) + return; + + // repeat, with a longer string + constexpr qsizetype MinSize = 128; + QString uniString = utf16.toString(); + while (uniString.size() < MinSize && encoded.size() < MinSize) { + uniString += uniString; + encoded += encoded; } + QCOMPARE(out.encode(uniString), encoded); + QCOMPARE(back.decode(encoded), uniString); + + QStringEncoder out2(code, flag); + QStringDecoder back2(code, flag); + decoded = back2.decode(out2.encode(uniString)); + QCOMPARE(decoded, uniString); +} +void tst_QStringConverter::convertL1U8() +{ { - // once again converting char by char - QStringDecoder decoder(QStringDecoder::Utf8); - QVERIFY(decoder.isValid()); - QString uniString; - for (int i = 0; i < ba.size(); ++i) - uniString += decoder(QByteArrayView(ba).sliced(i, 1)); - QCOMPARE(uniString, QString::fromUtf8(ba)); - uniString.clear(); - for (int i = 0; i < ba.size(); ++i) - uniString += decoder.decode(QByteArrayView(ba).sliced(i, 1)); - QCOMPARE(uniString, QString::fromUtf8(ba)); - - QStringEncoder encoder(QStringEncoder::Utf8); - QByteArray reencoded; - for (int i = 0; i < uniString.size(); ++i) - reencoded += encoder(QStringView(uniString).sliced(i, 1)); - QCOMPARE(ba, encoder(uniString)); - reencoded.clear(); - for (int i = 0; i < uniString.size(); ++i) - reencoded += encoder.encode(QStringView(uniString).sliced(i, 1)); - QCOMPARE(ba, encoder(uniString)); + std::array<char, 256> latin1; + std::iota(latin1.data(), latin1.data() + latin1.size(), uchar(0)); + std::array<char, 512> utf8; + auto out = QUtf8::convertFromLatin1(utf8.data(), QLatin1StringView{latin1.data(), latin1.size()}); + QCOMPARE(QString::fromLatin1(latin1.data(), latin1.size()), + QString::fromUtf8(utf8.data(), out - utf8.data())); } } -void tst_QStringConverter::nonFlaggedCodepointFFFF() const +#if QT_CONFIG(icu) + +void tst_QStringConverter::roundtripIcu_data() +{ + QTest::addColumn<QString>("original"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("shift_jis") << u"古池や 蛙飛び込む 水の音"_s << QByteArray("shift_jis"); + QTest::addRow("UTF7") << u"Übermäßig: čçö"_s << QByteArray("UTF-7"); +} + +void tst_QStringConverter::roundtripIcu() +{ + QFETCH(QString, original); + QFETCH(QByteArray, codec); + QStringEncoder fromUtf16(codec); + if (!fromUtf16.isValid()) + QSKIP("Unsupported codec"); + QStringDecoder toUtf16(codec); + QByteArray asShiftJIS = fromUtf16(original); + QString roundTripped = toUtf16(asShiftJIS); + QCOMPARE(roundTripped, original); +} + +void tst_QStringConverter::icuEncodeEdgeCases_data() { - //Check that the code point 0xFFFF (=non-character code 0xEFBFBF) is not flagged - const QChar ch(0xFFFF); + QTest::addColumn<QString>("source"); + QTest::addColumn<QByteArray>("expected") ; + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("empty") << QString() << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("BOMonly") << QString(QChar(QChar::ByteOrderMark)) << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("1to6") << u"좋"_s << QByteArray::fromHex("1b2428434141") << QByteArray("ISO-2022-JP-2"); + QTest::addRow("1to7") << u"漢"_s << QByteArray::fromHex("1b2429470e6947") << QByteArray("ISO-2022-CN"); + QTest::addRow("1to8") << u"墎"_s << QByteArray::fromHex("1b242a481b4e4949") << QByteArray("ISO-2022-CN"); + QTest::addRow("utf7") << u"Übergröße"_s << QByteArray("+ANw-bergr+APYA3w-e") << QByteArray("UTF-7"); +} - QStringEncoder encoder(QStringEncoder::Utf8); +void tst_QStringConverter::icuEncodeEdgeCases() +{ + QFETCH(QString, source); + QFETCH(QByteArray, expected); + QFETCH(QByteArray, codec); + QStringEncoder encoder(codec); + if (!encoder.isValid()) + QSKIP("Unsupported codec"); QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(source); + QCOMPARE(encoded, expected); +} - const QByteArray asDecoded = encoder(QStringView(&ch, 1)); - QCOMPARE(asDecoded, QByteArray("\357\277\277")); +void tst_QStringConverter::charByCharConsistency_data() +{ + QTest::addColumn<QStringView>("source"); + QTest::addColumn<QByteArray>("codec"); + + auto addRow = [](const TestString &s) { + QTest::addRow("%s_shift_jis", s.description) << s.utf16 << QByteArray("shift_jis"); + QTest::addRow("%s_EUC-CN", s.description) << s.utf16 << QByteArray("EUC-CN"); + }; + + for (const TestString &s : testStrings) { + if (s.utf16.isEmpty()) + continue; + addRow(s); + } +} - QByteArray ffff("\357\277\277"); - QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull); - QVERIFY(decoder.isValid()); - QVERIFY(decoder(ffff) == QString(1, ch)); +void tst_QStringConverter::charByCharConsistency() +{ + QFETCH(const QStringView, source); + QFETCH(const QByteArray, codec); + + const auto check = [&](QStringEncoder encoder){ + if (!encoder.isValid()) + QSKIP("Unsupported codec"); + + QByteArray fullyConverted = encoder.encode(source); + encoder.resetState(); + QByteArray stepByStepConverted; + for (const auto& codeUnit: source) { + stepByStepConverted += encoder.encode(codeUnit); + } + QCOMPARE(stepByStepConverted, fullyConverted); + }; + + check(QStringEncoder(codec)); + if (QTest::currentTestResolved()) return; + + check(QStringEncoder(codec, QStringConverter::Flag::ConvertInvalidToNull)); + if (QTest::currentTestResolved()) return; + + // moved codecs also work: + + { + QStringEncoder dec(codec); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + + { + QStringEncoder dec(codec, QStringConverter::Flag::ConvertInvalidToNull); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + +} + +void tst_QStringConverter::byteByByteConsistency_data() +{ + QTest::addColumn<QByteArray>("source"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("plain_ascii_utf7") << QByteArray("Hello, world!") << QByteArray("UTF-7"); + QFile eucKr(":/euc_kr.txt"); + if (eucKr.open(QFile::ReadOnly)) + QTest::addRow("euc_kr_storing_jp") << eucKr.readAll() << QByteArray("EUC-KR"); + QTest::addRow("incomplete_euc_jp") << QByteArrayLiteral("test\x8Ftest") << QByteArray("EUC-JP"); } +void tst_QStringConverter::byteByByteConsistency() +{ + QFETCH(const QByteArray, source); + QFETCH(const QByteArray, codec); + + const auto check = [&](QStringDecoder decoder) { + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + + QString fullyConverted = decoder.decode(source); + decoder.resetState(); + QString stepByStepConverted; + for (const auto& byte: source) { + QByteArray singleChar; + singleChar.append(byte); + stepByStepConverted += decoder.decode(singleChar); + } + QCOMPARE(stepByStepConverted, fullyConverted); + }; + + check(QStringDecoder(codec)); + if (QTest::currentTestResolved()) return; + + check(QStringDecoder(codec, QStringConverter::Flag::ConvertInvalidToNull)); + if (QTest::currentTestResolved()) return; + + // moved codecs also work: + + { + QStringDecoder dec(codec); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + + { + QStringDecoder dec(codec, QStringConverter::Flag::ConvertInvalidToNull); + check(std::move(dec)); + } + if (QTest::currentTestResolved()) return; + +} + +void tst_QStringConverter::statefulPieceWise() +{ + QStringDecoder decoder("HZ"); + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + QString start = decoder.decode("pure ASCII"); + QCOMPARE(start, u"pure ASCII"); + QString shifted = decoder.decode("~{"); + // shift out changes the state, but won't create any output + QCOMPARE(shifted, ""); + QString continuation = decoder.decode("\x42\x43"); + QCOMPARE(continuation, "旅"); + decoder.resetState(); + // after resetting the state we're in N0 again + QString afterReset = decoder.decode("\x42\x43"); + QCOMPARE(afterReset, "BC"); +} + +void tst_QStringConverter::icuUsableAfterMove() +{ + { + QStringDecoder decoder("EUC-JP"); + QVERIFY(decoder.isValid()); + QString partial = decoder.decode("Test\x8E"); + QCOMPARE(partial, u"Test"_s); + QStringDecoder moved(std::move(decoder)); + QString complete = partial + moved.decode("\xA1Test"); + QCOMPARE(complete, u"Test\uFF61Test"_s); + } + { + QStringEncoder encoder("Big5"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode("hello"_L1); + QCOMPARE(encoded, "hello"); + QStringEncoder moved(std::move(encoder)); + encoded = moved.encode("bye"); + QCOMPARE(encoded, "bye"); + } +} + +void tst_QStringConverter::icuInvalidCharacter_data() +{ + QTest::addColumn<QString>("string"); + QTest::addColumn<QByteArray>("bytearray"); + QTest::addColumn<QByteArray>("codec"); + QTest::addColumn<QStringConverter::Flags>("flags"); + QTest::addColumn<bool>("shouldDecode"); + + using Flags = QStringConverter::Flags; + using Flag = QStringConverter::Flag; + QTest::addRow("encode") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\x3F\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::Default) + << false; + QTest::addRow("encode_null") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\0\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::ConvertInvalidToNull) + << false; + QTest::addRow("decode_incomplete_EUC-JP") + << u"test"_s + << QByteArrayLiteral("test\x8F") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << true; + QTest::addRow("decode_invalid_EUC-JP_sequence") + << u"test\0test"_s + << QByteArrayLiteral("test\x8Ftest") + << QByteArray("EUC-JP") << Flags(Flag::ConvertInvalidToNull) + << true; + QTest::addRow("encode_incomplete_surrogate") + << u"test"_s + QChar::highSurrogate(0x11136) + << QByteArray("test") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << false; +} + +void tst_QStringConverter::icuInvalidCharacter() +{ + QFETCH(QString, string); + QFETCH(QByteArray, bytearray); + QFETCH(QByteArray, codec); + QFETCH(QStringConverter::Flags, flags); + QFETCH(bool, shouldDecode); + if (shouldDecode) { + QStringDecoder decoder(codec.data(), flags); + QVERIFY(decoder.isValid()); + QString decoded = decoder.decode(bytearray); + QVERIFY(decoder.hasError()); + QCOMPARE(decoded, string); + } else { + QStringEncoder encoder(codec.data(), flags); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(string); + QVERIFY(encoder.hasError()); + QCOMPARE(encoded, bytearray); + } +} + +#endif + void tst_QStringConverter::flagF7808080() const { /* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml, @@ -195,45 +794,6 @@ void tst_QStringConverter::flagF7808080() const QCOMPARE(decoder(input), QString(input.size(), QChar(0))); } -void tst_QStringConverter::nonFlaggedEFBFBF() const -{ - /* Check that the codec does NOT flag EFBFBF. - * This is a regression test; see QTBUG-33229 - */ - QByteArray validInput; - validInput.resize(3); - validInput[0] = char(0xEF); - validInput[1] = char(0xBF); - validInput[2] = char(0xBF); - - { - QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull); - QVERIFY(decoder.isValid()); - QVERIFY(decoder(validInput) == QString::fromUtf8(QByteArray::fromHex("EFBFBF"))); - } - - // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character - { - QByteArray start("B"); - start.append(validInput); - - QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull); - QVERIFY(decoder.isValid()); - QVERIFY(decoder(start) == QString::fromUtf8(QByteArray("B").append(QByteArray::fromHex("EFBFBF")))); - } -} - -void tst_QStringConverter::decode0D() const -{ - QByteArray input; - input.resize(3); - input[0] = 'A'; - input[1] = '\r'; - input[2] = 'B'; - - QCOMPARE(QString::fromUtf8(input.constData()).toUtf8(), input); -} - static QString fromInvalidUtf8Sequence(const QByteArray &ba) { return QString().fill(QChar::ReplacementCharacter, ba.size()); @@ -1326,6 +1886,51 @@ void tst_QStringConverter::utf8bom() QCOMPARE(decoder(data), result); } +// someone set us up the BOM! +void tst_QStringConverter::roundtripBom_data() +{ + QTest::addColumn<QStringView>("utf16"); + QTest::addColumn<QStringConverter::Encoding>("code"); + + for (const auto &code : codes) { + if (size_t(code.code) >= encodedBoms.size()) + break; + if (code.limitation != FullUnicode) + continue; // can't represent BOM + + for (const TestString &s : testStrings) { + if (s.utf16.isEmpty()) + continue; + QTest::addRow("%s:%s", code.name, s.description) << s.utf16 << code.code; + } + } +} + +void tst_QStringConverter::roundtripBom() +{ + QFETCH(QStringView, utf16); + QFETCH(QStringConverter::Encoding, code); + QByteArray encodedBom = encodedBoms[code].toByteArray(); + QChar bom = QChar::ByteOrderMark; + + // QStringConverter defaults to producing no BOM, but interpreting it if it + // is there + + QStringEncoder encoderWithoutBom(code); + QStringEncoder encoderWithBom(code, QStringEncoder::Flag::WriteBom); + QByteArray encodedWithoutBom = encoderWithoutBom(utf16); + QByteArray encodedWithBom = encoderWithBom(utf16); + QCOMPARE(encodedWithBom, encodedBom + encodedWithoutBom); + + QStringDecoder decoderWithoutBom(code, QStringDecoder::Flag::ConvertInitialBom); + QStringDecoder decoderWithBom(code); + QString decoded = decoderWithBom(encodedWithBom); + QCOMPARE(decoded, utf16); + + decoded = decoderWithoutBom(encodedWithBom); + QCOMPARE(decoded, bom + utf16.toString()); +} + void tst_QStringConverter::utf8stateful_data() { QTest::addColumn<QByteArray>("buffer1"); @@ -1596,7 +2201,7 @@ void tst_QStringConverter::utfHeaders() QVERIFY(decode.isValid()); QString result = decode(encoded); - QCOMPARE(result.length(), unicode.length()); + QCOMPARE(result.size(), unicode.size()); QCOMPARE(result, unicode); } @@ -1607,7 +2212,7 @@ void tst_QStringConverter::utfHeaders() QString result; for (char c : encoded) result += decode(QByteArrayView(&c, 1)); - QCOMPARE(result.length(), unicode.length()); + QCOMPARE(result.size(), unicode.size()); QCOMPARE(result, unicode); } @@ -1635,25 +2240,42 @@ void tst_QStringConverter::encodingForName_data() QTest::addColumn<QByteArray>("name"); QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); - QTest::newRow("UTF-8") << QByteArray("UTF-8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); - QTest::newRow("utf8") << QByteArray("utf8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); - QTest::newRow("Utf-8") << QByteArray("Utf-8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); - QTest::newRow("UTF-16") << QByteArray("UTF-16") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16); - QTest::newRow("UTF-16le") << QByteArray("UTF-16le") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE); - QTest::newRow("ISO-8859-1") << QByteArray("ISO-8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("ISO8859-1") << QByteArray("ISO8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("iso8859-1") << QByteArray("iso8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("latin1") << QByteArray("latin1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); - QTest::newRow("latin2") << QByteArray("latin2") << std::optional<QStringConverter::Encoding>(); - QTest::newRow("latin15") << QByteArray("latin15") << std::optional<QStringConverter::Encoding>(); + auto row = [](const char *name, std::optional<QStringConverter::Encoding> expected = std::nullopt) { + auto protect = [](auto p) { return p ? *p ? p : "<empty>" : "<nullptr>"; }; + QTest::addRow("%s", protect(name)) << QByteArray(name) << expected; + }; + + row("UTF-8", QStringConverter::Utf8); + row("utf8", QStringConverter::Utf8); + row("Utf-8", QStringConverter::Utf8); + row("UTF-16", QStringConverter::Utf16); + row("UTF-16le", QStringConverter::Utf16LE); + row("ISO-8859-1", QStringConverter::Latin1); + row("ISO8859-1", QStringConverter::Latin1); + row("iso8859-1", QStringConverter::Latin1); + row("latin1", QStringConverter::Latin1); + row("latin-1_-", QStringConverter::Latin1); + row("latin_1-_", QStringConverter::Latin1); + row("-_latin-1", QStringConverter::Latin1); + row("_-latin_1", QStringConverter::Latin1); + + // failures: + row(nullptr); + row(""); + row("latin2"); + row("latin42"); + row(" latin1"); // spaces are significant + row("\tlatin1"); // HTs are significant } void tst_QStringConverter::encodingForName() { - QFETCH(QByteArray, name); - QFETCH(std::optional<QStringConverter::Encoding>, encoding); + QFETCH(const QByteArray, name); + QFETCH(const std::optional<QStringConverter::Encoding>, encoding); - auto e = QStringConverter::encodingForName(name); + const auto *ptr = name.isNull() ? nullptr : name.data(); + + const auto e = QStringConverter::encodingForName(ptr); QCOMPARE(e, encoding); } @@ -1737,65 +2359,102 @@ void tst_QStringConverter::encodingForHtml_data() { QTest::addColumn<QByteArray>("html"); QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding"); + QTest::addColumn<QByteArray>("name"); // ICU name if we have ICU support QByteArray html = "<html><head></head><body>blah</body></html>"; - QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>"; - QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-8859-15"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=SJIS\" /></head></html>"; + QTest::newRow("sjis") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Shift_JIS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022-JP\" /></head></html>"; + QTest::newRow("ISO-2022-JP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022\" /></head></html>"; + QTest::newRow("ISO-2022") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312\" /></head></html>"; + QTest::newRow("GB2312") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5\" /></head></html>"; + QTest::newRow("Big5") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB18030\" /></head></html>"; + QTest::newRow("GB18030") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB18030"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312-HKSCS\" /></head></html>"; + QTest::newRow("GB2312-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312-HKSCS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5-HKSCS\" /></head></html>"; + QTest::newRow("Big5-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5-HKSCS"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucJP\" /></head></html>"; + QTest::newRow("EucJP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-JP"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucKR\" /></head></html>"; + QTest::newRow("EucKR") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-KR"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-R\" /></head></html>"; + QTest::newRow("KOI8-R") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-R"); + + html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-U\" /></head></html>"; + QTest::newRow("KOI8-U") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-U"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>"; - QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); + QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>"; - QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1); + QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>"; - QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>"; - QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>"; - QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>"; - QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>"; - QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>"; - QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); // Test invalid charsets. html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>"; - QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>"; - QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>() << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\""; html.prepend(QByteArray().fill(' ', 512 - html.size())); - QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8"; - QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8"; - QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>"; - QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>"; - QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>(); + QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>() << QByteArray(); const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 }; html = src; - QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE); + QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE) << QByteArray("UTF-16LE"); html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: " @@ -1803,19 +2462,33 @@ void tst_QStringConverter::encodingForHtml_data() "line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: " "auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: " "none;\">ͻ</span>\000"; - QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">" "<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16. - QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8); + QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8"); } void tst_QStringConverter::encodingForHtml() { QFETCH(QByteArray, html); QFETCH(std::optional<QStringConverter::Encoding>, encoding); + QFETCH(QByteArray, name); QCOMPARE(QStringConverter::encodingForHtml(html), encoding); + + QStringDecoder decoder = QStringDecoder::decoderForHtml(html); + if (encoding || // we should have a valid decoder independent of ICU support + decoder.isValid()) { // we got a valid decoder through ICU + QCOMPARE(decoder.name(), name); + } +} + +void tst_QStringConverter::availableCodesAreAvailable() +{ + auto codecs = QStringConverter::availableCodecs(); + for (const auto &codecName: codecs) + QVERIFY(QStringEncoder(codecName.toLatin1()).isValid()); } class LoadAndConvert: public QRunnable @@ -1838,8 +2511,20 @@ public: } }; +void tst_QStringConverter::initTestCase() +{ + if (localeIsUtf8()) + qInfo("System locale is UTF-8"); + else + qInfo("System locale is not UTF-8"); +} + void tst_QStringConverter::threadSafety() { +#if defined(Q_OS_WASM) + QSKIP("This test misbehaves on WASM. Investigation needed (QTBUG-110067)"); +#endif + QThreadPool::globalInstance()->setMaxThreadCount(12); QList<QString> res; @@ -1855,6 +2540,292 @@ void tst_QStringConverter::threadSafety() QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz")); } +#ifdef Q_OS_WIN +void tst_QStringConverter::fromLocal8Bit_data() +{ + QTest::addColumn<QByteArray>("eightBit"); + QTest::addColumn<QString>("utf16"); + QTest::addColumn<quint32>("codePage"); + + constexpr uint WINDOWS_1252 = 1252u; + QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252; + constexpr uint SHIFT_JIS = 932u; + // Mostly two byte characters, but the comma is a single byte character (0xa4) + QTest::newRow("shiftJIS") + << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba + << u"こんにちは、世界!"_s << SHIFT_JIS; + + constexpr uint GB_18030 = 54936u; + QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界!"_s + << GB_18030; +} + +void tst_QStringConverter::fromLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (char c : eightBit) + result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + // Decode the full string again, this time without state + state.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_special_cases() +{ + QStringConverter::State state; + constexpr uint SHIFT_JIS = 932u; + // Decode a 2-octet character, but only provide 1 octet at first: + QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second octet: + result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state); + QCOMPARE(result, u"こ"); + QCOMPARE(state.remainingChars, 0); + + // And without state: + result.clear(); + QStringConverter::State statelessState; + statelessState.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &statelessState); + result += QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &statelessState); + // 0xb1 is a valid single-octet character in Shift-JIS, so the output + // isn't really what you would expect: + QCOMPARE(result, QString(QChar::ReplacementCharacter) + u'ア'); + QCOMPARE(statelessState.remainingChars, 0); + + // Now try a 3-octet UTF-8 sequence: + result.clear(); + state.clear(); + constexpr uint UTF8 = 65001u; + // First the first 2 octets: + result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the remaining octet: + result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QCOMPARE(result, u"你"); + QCOMPARE(state.remainingChars, 0); + + // Now the same, but there is an incomplete sequence at the start + result.clear(); + state.clear(); + result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + // Remaining octet (and a '.' to force it to discard something from the + // internal state which is currently limited to 4 octets): + result += QLocal8Bit::convertToUnicode_sys("\xa0.", UTF8, &state); + QCOMPARE(result, QChar::ReplacementCharacter + u"你."_s); + QCOMPARE(state.remainingChars, 0); + + // Test QTBUG-118834, which is failing + result.clear(); + state.clear(); + result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + // Remaining octet: + result += QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QEXPECT_FAIL("", "QTBUG-118834: We don't output anything because it's " + "within the size of our internal state, and we cannot " + "signal that it needs to be drained.", Continue); + QCOMPARE(result, QChar::ReplacementCharacter + u"你"_s); + QEXPECT_FAIL("", "QTBUG-118834: As above", Continue); + QCOMPARE(state.remainingChars, 0); + + // Now try a 4-octet GB 18030 sequence: + result.clear(); + state.clear(); + constexpr uint GB_18030 = 54936u; + const char sequence[] = "\x95\x32\x90\x31"; + // Repeat the sequence multiple times to test handling of exhaustion of + // internal buffer + QByteArray repeated = QByteArray(sequence).repeated(2049); + QByteArrayView octets = QByteArrayView(repeated); + result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide one more octet: + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the last octet + the rest of the string + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(3), GB_18030, &state); + QCOMPARE(result.first(2), u"𠂇"); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3; + QByteArray input; + QT_TRY { + input.reserve(size); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // fill with '、' - a single octet character in Shift-JIS + input.fill('\xa4', std::numeric_limits<int>::max() - 1); + // then append 'こ' - a two octet character in Shift-JIS + // which is now straddling the 2 GiB boundary + input += "\x82\xb1"; + // then append another two '、', so that our output is also crossing the + // 2 GiB boundary + input += "\xa4\xa4"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint SHIFT_JIS = 932u; + QStringConverter::State state; + QString result; + QT_TRY { + result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16 + QCOMPARE(result.last(4), u"、こ、、"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} + +void tst_QStringConverter::toLocal8Bit_data() +{ + fromLocal8Bit_data(); +} + +void tst_QStringConverter::toLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (QChar c : utf16) + result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + // Decode the full string again, this time without state + state.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_special_cases() +{ + QStringConverter::State state; + // Normally utf8 goes through a different code path, but we can force it here + constexpr uint UTF8 = 65001u; + // Decode a 2-code unit character, but only provide 1 code unit at first: + const char16_t a[] = u"𬽦"; + QStringView codeUnits = a; + QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second code unit: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); + + // Retain compat with the behavior for toLocal8Bit: + QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?"); + + // QString::toLocal8Bit is already stateless, but test stateless handling + // explicitly anyway: + result.clear(); + QStringConverter::State statelessState; + statelessState.flags |= QStringConverter::Flag::Stateless; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &statelessState); + result += QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &statelessState); + // Windows uses the replacement character for invalid characters: + QCOMPARE(result, "\ufffd\ufffd"); + + // Now do the same, but the second time we feed in a character, we also + // provide many more so the internal stack buffer is not large enough. + result.clear(); + state.clear(); + QString str = QStringView(a).toString().repeated(2048); + codeUnits = str; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then we provide the rest of the string: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + constexpr qsizetype TwoGiB = qsizetype(std::numeric_limits<int>::max()); + QString input; + QT_TRY { + input.reserve(TwoGiB + 1); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // Fill with a single code unit character + input.fill(u'.', TwoGiB - 1); + // Then append a 2 code unit character, so that the input straddles the 2 GiB + // boundary + input += u"🙂"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint UTF8 = 65001u; + QStringConverter::State state; + QByteArray result; + QT_TRY { + result = QLocal8Bit::convertFromUnicode_sys(input, UTF8, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QUtf8StringView rView = result; + QCOMPARE(rView.size(), TwoGiB + 3); // The 2 code unit smiley is 4 code units in UTF-8 + QCOMPARE(rView.last(7), u8"...🙂"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} +#endif // Q_OS_WIN + struct DontCrashAtExit { ~DontCrashAtExit() { QStringDecoder decoder(QStringDecoder::Utf8); diff --git a/tests/auto/corelib/text/qstringconverter/utf8.txt b/tests/auto/corelib/text/qstringconverter/utf8.txt deleted file mode 100644 index f5ab44c8f4..0000000000 --- a/tests/auto/corelib/text/qstringconverter/utf8.txt +++ /dev/null @@ -1 +0,0 @@ -<doc>𐀀</doc>
|