diff options
author | Fabian Kosmale <fabian.kosmale@qt.io> | 2022-01-31 11:25:25 +0100 |
---|---|---|
committer | Fabian Kosmale <fabian.kosmale@qt.io> | 2022-06-19 00:41:12 +0200 |
commit | 122270d6bea164e6df4357f4d4d77aacfa430470 (patch) | |
tree | 4d0477aa23a0575b7d6185311ca51a56746ab0f4 /tests | |
parent | d350373133f169b44fd98faab6fe3f75abab6282 (diff) |
Long live the ICU-based QStringConverter interface!
This adds support for additional codecs to QStringConverter when ICU is
available.
We store the converter in the state (d[0]), and its canonical name in
d[1]. We need the name there, as in the clear function we close the
UConverter, and set the pointer to null. Consequently, the actual
conversion functions might need to re-open the converter again. The
advantage of this approach is that clear is used in the destructor of
State, and with this approach we properly clean up the state.
There is however a disadvantage: The clear function was so far also used
for resetting the state when QStringConverter::resetState is called. Discarding
the whole UConverter for that is however rather costly. For that reason
we modify resetState to call a new function, State::reset. For existing
converters, it behaves the same as clear; for the ICU based converter,
we call the more efficient ucnv_reset. Code compiled against Qt 6.4 can
benefit from this more efficient version; code compiled against older Qt
versions will continue to work, as the conversion functions can just
recreate the converter from the name.
We can distinguish between ICU and non-ICU converters by checking if the
UsesIcu flag is set.
QStringConverter::name is changed to return the name stored in d[1]. The
interface of the ICU converter has a dummy name, so code using the old
name function from Qt < 6.4 still returns something, namely a message
asking the user to recompile.
The function is moved out of line, as we need to check for the private
ICU feature, and want to avoid having that check in the public header.
As the QStringConverter ctor taking a name now can allocate memory, it
can no longer be noexcept. Removing the noexceptness is safe, as it was
only added after Qt 6.3.
Note that we cannot extend the API consuming or returning Encoding, as
we use Encoding values to index into an array of converter interfaces in
inline API.
Further API to support getting an ICU converter for HTML will be added
in a future commit.
Currently, the code depending on ICU is enabled at compile time if ICU
is found. However, in the future it could be moved into a plugin to
avoid a hard dependency on ICU in Core.
[ChangeLog][Corelib][Text] QStringConverter and API using it now
support more text codecs if Qt is compiled with ICU support.
Fixes: QTBUG-103375
Change-Id: I7afb92fc68ef994179ebc7a3aa73beebb1386204
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'tests')
3 files changed, 280 insertions, 2 deletions
diff --git a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt index 07e33e26ca..a7816de1bf 100644 --- a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt +++ b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt @@ -8,4 +8,14 @@ qt_internal_add_test(tst_qstringconverter SOURCES tst_qstringconverter.cpp TESTDATA ${test_data} + PUBLIC_LIBRARIES + Qt::CorePrivate # for access to Qt's feature system +) + + +qt_internal_add_resource(tst_qstringconverter "compressedtexture_bc1" + PREFIX + "/" + FILES + "euc_kr.txt" ) diff --git a/tests/auto/corelib/text/qstringconverter/euc_kr.txt b/tests/auto/corelib/text/qstringconverter/euc_kr.txt new file mode 100644 index 0000000000..a0eb9af691 --- /dev/null +++ b/tests/auto/corelib/text/qstringconverter/euc_kr.txt @@ -0,0 +1 @@ +Ы?Ȫʦ??ɪǪEUC Packed Formatȡ2ЫͳEUC Fixed Width Format롣ġ?ݻ?ĪǡEUCȪǪ˪Ī?롣 diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 633346a639..07f29a6429 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -4,11 +4,14 @@ #include <QTest> +#include <QtCore/private/qglobal_p.h> #include <qstringconverter.h> #include <qthreadpool.h> #include <array> +using namespace Qt::StringLiterals; + static constexpr bool IsBigEndian = QSysInfo::ByteOrder == QSysInfo::BigEndian; enum CodecLimitation { AsciiOnly, @@ -29,8 +32,6 @@ static constexpr bool localeIsUtf8() } #endif -using namespace Qt::StringLiterals; - struct Codec { const char name[12]; @@ -129,6 +130,21 @@ private slots: void roundtrip_data(); void roundtrip(); +#if QT_CONFIG(icu) + void roundtripIcu_data(); + void roundtripIcu(); + void icuInvalidCharacter_data(); + void icuInvalidCharacter(); + void icuEncodeEdgeCases_data(); + void icuEncodeEdgeCases(); + void icuUsableAfterMove(); + void 
charByCharConsistency_data(); + void charByCharConsistency(); + void byteByByteConsistency_data(); + void byteByByteConsistency(); + void statefulPieceWise(); +#endif + void flagF7808080() const; void utf8Codec_data(); @@ -411,6 +427,257 @@ void tst_QStringConverter::roundtrip() QCOMPARE(decoded, uniString); } +#if QT_CONFIG(icu) + +void tst_QStringConverter::roundtripIcu_data() +{ + QTest::addColumn<QString>("original"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("shift_jis") << u"古池や 蛙飛び込む 水の音"_s << QByteArray("shift_jis"); + QTest::addRow("UTF7") << u"Übermäßig: čçö"_s << QByteArray("UTF-7"); +} + +void tst_QStringConverter::roundtripIcu() +{ + QFETCH(QString, original); + QFETCH(QByteArray, codec); + QStringEncoder fromUtf16(codec); + if (!fromUtf16.isValid()) + QSKIP("Unsupported codec"); + QStringDecoder toUtf16(codec); + QByteArray asShiftJIS = fromUtf16(original); + QString roundTripped = toUtf16(asShiftJIS); + QCOMPARE(roundTripped, original); +} + +void tst_QStringConverter::icuEncodeEdgeCases_data() +{ + QTest::addColumn<QString>("source"); + QTest::addColumn<QByteArray>("expected") ; + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("empty") << QString() << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("BOMonly") << QString(QChar(QChar::ByteOrderMark)) << QByteArray() << QByteArray("ISO-2022-CN"); + QTest::addRow("1to6") << u"좋"_s << QByteArray::fromHex("1b2428434141") << QByteArray("ISO-2022-JP-2"); + QTest::addRow("1to7") << u"漢"_s << QByteArray::fromHex("1b2429470e6947") << QByteArray("ISO-2022-CN"); + QTest::addRow("1to8") << u"墎"_s << QByteArray::fromHex("1b242a481b4e4949") << QByteArray("ISO-2022-CN"); + QTest::addRow("utf7") << u"Übergröße"_s << QByteArray("+ANw-bergr+APYA3w-e") << QByteArray("UTF-7"); +} + +void tst_QStringConverter::icuEncodeEdgeCases() +{ + QFETCH(QString, source); + QFETCH(QByteArray, expected); + QFETCH(QByteArray, codec); + QStringEncoder encoder(codec); + if (!encoder.isValid()) + 
QSKIP("Unsupported codec"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(source); + QCOMPARE(encoded, expected); +} + +void tst_QStringConverter::charByCharConsistency_data() +{ + QTest::addColumn<QStringView>("source"); + QTest::addColumn<QByteArray>("codec"); + + auto addRow = [](const TestString &s) { + QTest::addRow("%s_shift_jis", s.description) << s.utf16 << QByteArray("shift_jis"); + QTest::addRow("%s_EUC-CN", s.description) << s.utf16 << QByteArray("EUC-CN"); + }; + + for (const TestString &s : testStrings) { + if (s.utf16.isEmpty()) + continue; + addRow(s); + } +} + +void tst_QStringConverter::charByCharConsistency() +{ + QFETCH(QStringView, source); + QFETCH(QByteArray, codec); + + { + QStringEncoder encoder(codec); + if (!encoder.isValid()) + QSKIP("Unsupported codec"); + + QByteArray fullyConverted = encoder.encode(source); + encoder.resetState(); + QByteArray stepByStepConverted; + for (const auto& codeUnit: source) { + stepByStepConverted += encoder.encode(codeUnit); + } + QCOMPARE(stepByStepConverted, fullyConverted); + } + + { + QStringEncoder encoder(codec, QStringConverter::Flag::ConvertInvalidToNull); + + QByteArray fullyConverted = encoder.encode(source); + encoder.resetState(); + QByteArray stepByStepConverted; + for (const auto& codeUnit: source) { + stepByStepConverted += encoder.encode(codeUnit); + } + QCOMPARE(stepByStepConverted, fullyConverted); + } +} + +void tst_QStringConverter::byteByByteConsistency_data() +{ + QTest::addColumn<QByteArray>("source"); + QTest::addColumn<QByteArray>("codec"); + + QTest::addRow("plain_ascii_utf7") << QByteArray("Hello, world!") << QByteArray("UTF-7"); + QFile eucKr(":/euc_kr.txt"); + if (eucKr.open(QFile::ReadOnly)) + QTest::addRow("euc_kr_storing_jp") << eucKr.readAll() << QByteArray("EUC-KR"); + QTest::addRow("incomplete_euc_jp") << QByteArrayLiteral("test\x8Ftest") << QByteArray("EUC-JP"); +} + +void tst_QStringConverter::byteByByteConsistency() +{ + QFETCH(QByteArray, source); 
+ QFETCH(QByteArray, codec); + + { + QStringDecoder decoder(codec); + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + + QString fullyConverted = decoder.decode(source); + decoder.resetState(); + QString stepByStepConverted; + for (const auto& byte: source) { + QByteArray singleChar; + singleChar.append(byte); + stepByStepConverted += decoder.decode(singleChar); + } + QCOMPARE(stepByStepConverted, fullyConverted); + } + + { + QStringDecoder decoder(codec, QStringConverter::Flag::ConvertInvalidToNull); + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + + QString fullyConverted = decoder.decode(source); + decoder.resetState(); + QString stepByStepConverted; + for (const auto& byte: source) { + QByteArray singleChar; + singleChar.append(byte); + stepByStepConverted += decoder.decode(singleChar); + } + QCOMPARE(stepByStepConverted, fullyConverted); + } +} + +void tst_QStringConverter::statefulPieceWise() +{ + QStringDecoder decoder("HZ"); + if (!decoder.isValid()) + QSKIP("Unsupported codec"); + QString start = decoder.decode("pure ASCII"); + QCOMPARE(start, u"pure ASCII"); + QString shifted = decoder.decode("~{"); + // shift out changes the state, but won't create any output + QCOMPARE(shifted, ""); + QString continuation = decoder.decode("\x42\x43"); + QCOMPARE(continuation, "旅"); + decoder.resetState(); + // after resetting the state we're in N0 again + QString afterReset = decoder.decode("\x42\x43"); + QCOMPARE(afterReset, "BC"); +} + +void tst_QStringConverter::icuUsableAfterMove() +{ + { + QStringDecoder decoder("EUC-JP"); + QVERIFY(decoder.isValid()); + QString partial = decoder.decode("Test\x8E"); + QCOMPARE(partial, u"Test"_s); + QStringDecoder moved(std::move(decoder)); + QString complete = partial + moved.decode("\xA1Test"); + QCOMPARE(complete, u"Test\uFF61Test"_s); + } + { + QStringEncoder encoder("Big5"); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode("hello"_L1); + QCOMPARE(encoded, "hello"); + QStringEncoder 
moved(std::move(encoder)); + encoded = moved.encode("bye"); + QCOMPARE(encoded, "bye"); + } +} + +void tst_QStringConverter::icuInvalidCharacter_data() +{ + QTest::addColumn<QString>("string"); + QTest::addColumn<QByteArray>("bytearray"); + QTest::addColumn<QByteArray>("codec"); + QTest::addColumn<QStringConverter::Flags>("flags"); + QTest::addColumn<bool>("shouldDecode"); + + using Flags = QStringConverter::Flags; + using Flag = QStringConverter::Flag; + QTest::addRow("encode") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\x3F\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::Default) + << false; + QTest::addRow("encode_null") + << u"Test👪Test"_s + << QByteArrayLiteral("\xE3\x85\xA2\xA3\0\xE3\x85\xA2\xA3") + << QByteArray("IBM-037") << Flags(Flag::ConvertInvalidToNull) + << false; + QTest::addRow("decode_incomplete_EUC-JP") + << u"test"_s + << QByteArrayLiteral("test\x8F") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << true; + QTest::addRow("decode_invalid_EUC-JP_sequence") + << u"test\0test"_s + << QByteArrayLiteral("test\x8Ftest") + << QByteArray("EUC-JP") << Flags(Flag::ConvertInvalidToNull) + << true; + QTest::addRow("encode_incomplete_surrogate") + << u"test"_s + QChar::highSurrogate(0x11136) + << QByteArray("test") + << QByteArray("EUC-JP") << Flags(Flag::Stateless) + << false; +} + +void tst_QStringConverter::icuInvalidCharacter() +{ + QFETCH(QString, string); + QFETCH(QByteArray, bytearray); + QFETCH(QByteArray, codec); + QFETCH(QStringConverter::Flags, flags); + QFETCH(bool, shouldDecode); + if (shouldDecode) { + QStringDecoder decoder(codec.data(), flags); + QVERIFY(decoder.isValid()); + QString decoded = decoder.decode(bytearray); + QVERIFY(decoder.hasError()); + QCOMPARE(decoded, string); + } else { + QStringEncoder encoder(codec.data(), flags); + QVERIFY(encoder.isValid()); + QByteArray encoded = encoder.encode(string); + QVERIFY(encoder.hasError()); + QCOMPARE(encoded, bytearray); + } +} + +#endif + void 
tst_QStringConverter::flagF7808080() const { /* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml, |