summary | refs | log | tree | commit | diff | stats
path: root/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp')
-rw-r--r-- tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp 1267
1 files changed, 1113 insertions, 154 deletions
diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
index 38714ef580..7c0235998f 100644
--- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
+++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
@@ -1,58 +1,180 @@
-/****************************************************************************
-**
-** Copyright (C) 2020 The Qt Company Ltd.
-** Copyright (C) 2016 Intel Corporation.
-** Contact: https://www.qt.io/licensing/
-**
-** This file is part of the test suite of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:GPL-EXCEPT$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 3 as published by the Free Software
-** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2021 The Qt Company Ltd.
+// Copyright (C) 2016 Intel Corporation.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only
#include <QTest>
+#include <QtCore/private/qglobal_p.h>
#include <qstringconverter.h>
+#include <private/qstringconverter_p.h>
#include <qthreadpool.h>
+#include <array>
+#include <numeric>
+
+using namespace Qt::StringLiterals;
+
+QT_BEGIN_NAMESPACE
+namespace QTest {
+template <typename T> // Overload of QTest::toString() for std::optional<T>.
+char *toString(const std::optional<T> &opt)
+{
+ if (opt) // engaged: delegate to QTest::toString() of the contained value
+ return QTest::toString(*opt);
+ else // disengaged: return a qstrdup() copy of the text "std::nullopt"
+ return qstrdup("std::nullopt");
+}
+} // namespace QTest
+QT_END_NAMESPACE
+
+using QTest::toString;
+
+static constexpr bool IsBigEndian = QSysInfo::ByteOrder == QSysInfo::BigEndian; // selects the byte order of the Utf16/Utf32 entries in encodedBoms
+enum CodecLimitation { // Enumerator order matters: roundtrip_data() compares limitations with operator<.
+ AsciiOnly, // most restrictive
+ Latin1Only,
+ FullUnicode // least restrictive; the default for Codec and TestString
+};
+
+#ifdef Q_OS_WIN
+# include <qt_windows.h>
+static bool localeIsUtf8() // Windows: true only when the active code page reported by GetACP() is CP_UTF8.
+{
+ return GetACP() == CP_UTF8;
+}
+#else
+static constexpr bool localeIsUtf8() // Non-Windows: always true, so it can be constexpr.
+{
+ return true;
+}
+#endif
+
+struct Codec // One codec under test: display name, encoding enum value and what it can represent.
+{
+ const char name[12]; // fixed-size buffer: a name must fit in 12 bytes including its terminator
+ QStringConverter::Encoding code;
+ CodecLimitation limitation = FullUnicode; // entries that omit this are treated as full-Unicode codecs
+};
+static const std::array codes = { // Codecs iterated by roundtrip_data() and roundtripBom_data().
+ Codec{ "UTF-8", QStringConverter::Utf8 },
+ Codec{ "UTF-16", QStringConverter::Utf16 },
+ Codec{ "UTF-16-le", QStringConverter::Utf16LE },
+ Codec{ "UTF-16-be", QStringConverter::Utf16BE },
+ Codec{ "UTF-32", QStringConverter::Utf32 },
+ Codec{ "UTF-32-le", QStringConverter::Utf32LE },
+ Codec{ "UTF-32-be", QStringConverter::Utf32BE },
+ Codec{ "Latin-1", QStringConverter::Latin1, Latin1Only },
+ Codec{ "System", QStringConverter::System, localeIsUtf8() ? FullUnicode : AsciiOnly } // only ASCII is assumed safe when the locale is not UTF-8
+};
+
+static const std::array encodedBoms = { // Encoded byte-order marks; roundtripBom() indexes this with the QStringConverter::Encoding value.
+ QByteArrayView("\xef\xbb\xbf"), // Utf8,
+ QByteArrayView(IsBigEndian ? "\xfe\xff" : "\xff\xfe"), // Utf16,
+ QByteArrayView("\xff\xfe"), // Utf16LE,
+ QByteArrayView("\xfe\xff"), // Utf16BE,
+ QByteArrayView(IsBigEndian ? "\0\0\xfe\xff" : "\xff\xfe\0", 4), // Utf32,
+ QByteArrayView("\xff\xfe\0", 4), // Utf32LE,
+ QByteArrayView("\0\0\xfe\xff", 4), // Utf32BE,
+};
+
+struct TestString // One test text, held as both UTF-8 and UTF-16 views of the same content.
+{
+ const char *description; // used to build the data-row names
+ QUtf8StringView utf8;
+ QStringView utf16;
+ CodecLimitation limitation = FullUnicode; // compared against Codec::limitation in roundtrip_data()
+};
+static const std::array testStrings = { // Shared corpus for the data-driven tests; ROW() entries keep the default FullUnicode limitation.
+ TestString{ "empty", "", u"", AsciiOnly },
+ TestString{ "null-character", QUtf8StringView("", 1), QStringView(u"", 1), AsciiOnly }, // length 1 takes in the literal's terminating NUL
+ TestString{ "ascii-text",
+ "This is a standard US-ASCII message",
+ "This is a standard US-ASCII message" u"", // the adjacent u"" makes the concatenated literal a char16_t string
+ AsciiOnly
+ },
+ TestString{ "ascii-with-carriage-return", "a\rb", u"a\rb", AsciiOnly },
+ TestString{ "ascii-with-control",
+ "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars",
+ "\1This\2is\3an\4US-ASCII\020 message interspersed with control chars" u"",
+ AsciiOnly
+ },
+
+ TestString{ "nbsp", "\u00a0", u"\u00a0", Latin1Only },
+ TestString{ "latin1-text",
+ "Hyvää päivää, käyhän että tuon kannettavani saunaan?",
+ "Hyvää päivää, käyhän että tuon kannettavani saunaan?" u"",
+ Latin1Only
+ },
+
+#define ROW(name, string) TestString{ name, u8"" string, u"" string }
+ ROW("euro", "€"),
+ ROW("character+bom", "b\ufeff"),
+ /* Check that the codec does NOT flag EFBFBF.
+ * This is a regression test; see QTBUG-33229
+ */
+ ROW("last-bmp", "\uffff"),
+ ROW("character+last-bmp", "b\uffff"),
+ ROW("replacement", "\ufffd"),
+ ROW("supplementary-plane", "\U00010203"),
+ ROW("mahjong", "\U0001f000\U0001f001\U0001f002\U0001f003\U0001f004\U0001f005"
+ "\U0001f006\U0001f007\U0001f008\U0001f009\U0001f00a\U0001f00b\U0001f00c"
+ "\U0001f00d\U0001f00e\U0001f00f"),
+ ROW("emojis", "😂, 😃, 🧘🏻‍♂️, 🌍, 🌦️, 🍞, 🚗, 📞, 🎉, ❤️, 🏁"), // https://en.wikipedia.org/wiki/Emoji
+ ROW("last-valid", "\U0010fffd"), // U+10FFFF is the strict last, but it's a non-character
+ ROW("mixed-bmp-only", "abc\u00a0\u00e1\u00e9\u01fd \u20acdef"),
+ ROW("mixed-full", "abc\u00a0\u00e1\u00e9\u01fd \U0010FFFD \u20acdef"),
+ ROW("xml", "<doc>\U00010000\U0010FFFD</doc>\r\n")
+#undef ROW
+};
+
class tst_QStringConverter : public QObject
{
Q_OBJECT
private slots:
+ void initTestCase();
+
void threadSafety();
void constructByName();
+ void invalidConverter();
+
+ void convertUtf8_data();
void convertUtf8();
+ void convertUtf8CharByChar_data() { convertUtf8_data(); }
+ void convertUtf8CharByChar();
+ void roundtrip_data();
+ void roundtrip();
+
+ void convertL1U8();
+
+ void convertL1U16();
+
+#if QT_CONFIG(icu)
+ void roundtripIcu_data();
+ void roundtripIcu();
+ void icuInvalidCharacter_data();
+ void icuInvalidCharacter();
+ void icuEncodeEdgeCases_data();
+ void icuEncodeEdgeCases();
+ void icuUsableAfterMove();
+ void charByCharConsistency_data();
+ void charByCharConsistency();
+ void byteByByteConsistency_data();
+ void byteByByteConsistency();
+ void statefulPieceWise();
+#endif
- void nonFlaggedCodepointFFFF() const;
void flagF7808080() const;
- void nonFlaggedEFBFBF() const;
- void decode0D() const;
void utf8Codec_data();
void utf8Codec();
void utf8bom_data();
void utf8bom();
+ void roundtripBom_data();
+ void roundtripBom();
void utf8stateful_data();
void utf8stateful();
@@ -71,6 +193,20 @@ private slots:
void encodingForHtml_data();
void encodingForHtml();
+
+ void availableCodesAreAvailable();
+
+#ifdef Q_OS_WIN
+ // On all other systems local 8-bit encoding is UTF-8
+ void fromLocal8Bit_data();
+ void fromLocal8Bit();
+ void fromLocal8Bit_special_cases();
+ void fromLocal8Bit_2GiB();
+ void toLocal8Bit_data();
+ void toLocal8Bit();
+ void toLocal8Bit_special_cases();
+ void toLocal8Bit_2GiB();
+#endif
};
void tst_QStringConverter::constructByName()
@@ -98,73 +234,524 @@ void tst_QStringConverter::constructByName()
QVERIFY(!strcmp(decoder.name(), "UTF-16"));
}
+void tst_QStringConverter::invalidConverter() // Checks that default-constructed converters are invalid, produce no output and report an error.
+{
+ // QStringEncoder tests
+ {
+ QStringEncoder encoder; // default-constructed: no encoding selected
+ QVERIFY(!encoder.isValid());
+ QVERIFY(!encoder.name());
+ QByteArray encoded = encoder(u"Some text"); // call-operator form
+ QVERIFY(encoded.isEmpty());
+ QVERIFY(encoder.hasError());
+
+ encoder.resetState(); // expected to clear the error flag (verified on the next line)
+ QVERIFY(!encoder.hasError());
+
+ encoded = encoder.encode(u"More text"); // .encode() form must behave the same way
+ QVERIFY(encoded.isEmpty());
+ QVERIFY(encoder.hasError());
+ QCOMPARE(encoder.requiredSpace(42), 0);
+
+ encoder.resetState();
+ QVERIFY(!encoder.hasError());
+ char buffer[100];
+ char *position = encoder.appendToBuffer(buffer, u"Even more");
+ QCOMPARE(position, buffer); // returned pointer equals the buffer start, i.e. nothing was written
+ QVERIFY(encoder.hasError());
+ }
+
+ // QStringDecoder tests
+ {
+ QStringDecoder decoder; // same sequence of checks as for the encoder above
+ QVERIFY(!decoder.name());
+ QVERIFY(!decoder.isValid());
+ QString decoded = decoder("Some text");
+ QVERIFY(decoded.isEmpty());
+ QVERIFY(decoder.hasError());
+
+ decoder.resetState();
+ QVERIFY(!decoder.hasError());
+
+ decoded = decoder.decode("More text");
+ QVERIFY(decoded.isEmpty());
+ QVERIFY(decoder.hasError());
+
+ QCOMPARE(decoder.requiredSpace(42), 0);
+
+ decoder.resetState();
+ QVERIFY(!decoder.hasError());
+ char16_t buffer[100];
+ char16_t *position = decoder.appendToBuffer(buffer, "Even more");
+ QCOMPARE(position, buffer); // nothing was written
+ QVERIFY(decoder.hasError());
+ }
+}
+
+void tst_QStringConverter::convertUtf8_data() // Data rows for convertUtf8(); convertUtf8CharByChar_data() forwards here as well.
+{
+ QTest::addColumn<QStringConverter::Encoding>("encoding");
+ QTest::addColumn<QUtf8StringView>("utf8");
+ QTest::addColumn<QStringView>("utf16");
+ auto addRow = [](const TestString &s) { // one Utf8 row per string, plus a System row when the locale is UTF-8
+ QTest::addRow("Utf8:%s", s.description) << QStringDecoder::Utf8 << s.utf8 << s.utf16;
+ if (localeIsUtf8())
+ QTest::addRow("System:%s", s.description) << QStringDecoder::System << s.utf8 << s.utf16;
+ };
+
+ for (const TestString &s : testStrings)
+ addRow(s);
+}
+
void tst_QStringConverter::convertUtf8()
{
- QFile file(QFINDTESTDATA("utf8.txt"));
+ QFETCH(QStringConverter::Encoding, encoding);
+ QFETCH(QUtf8StringView, utf8);
+ QFETCH(QStringView, utf16);
+
+ QByteArray ba = QByteArray::fromRawData(utf8.data(), utf8.size());
+
+ QStringDecoder decoder(encoding);
+ QVERIFY(decoder.isValid());
+ QString uniString = decoder(ba);
+ QCOMPARE(uniString, utf16);
+ QCOMPARE(uniString, QString::fromUtf8(ba));
+ QCOMPARE(ba, uniString.toUtf8());
+
+ // do it again (using .decode())
+ uniString = decoder.decode(ba);
+ QCOMPARE(uniString, utf16);
+ QCOMPARE(uniString, QString::fromUtf8(ba));
+ QCOMPARE(ba, uniString.toUtf8());
+
+ QStringEncoder encoder(encoding);
+ QByteArray reencoded = encoder(utf16);
+ QCOMPARE(reencoded, utf8);
+ QCOMPARE(reencoded, uniString.toUtf8());
+
+ // do it again (using .encode())
+ reencoded = encoder.encode(utf16);
+ QCOMPARE(reencoded, utf8);
+ QCOMPARE(reencoded, uniString.toUtf8());
+
+ if (utf16.isEmpty())
+ return;
+
+ // repeat, with a longer string
+ constexpr qsizetype MinSize = 128;
+ uniString = utf16.toString();
+ while (uniString.size() < MinSize && ba.size() < MinSize) {
+ uniString += uniString;
+ ba += ba;
+ }
+ QCOMPARE(decoder(ba), uniString);
+ QCOMPARE(encoder(uniString), ba);
+}
+
+void tst_QStringConverter::convertUtf8CharByChar() // Same rows as convertUtf8(), but feeds the converters one byte / one UTF-16 code unit per call.
+{
+ QFETCH(QStringConverter::Encoding, encoding);
+ QFETCH(QUtf8StringView, utf8);
+ QFETCH(QStringView, utf16);
+
+ QByteArray ba = QByteArray::fromRawData(utf8.data(), utf8.size());
+
+ QStringDecoder decoder(encoding);
+ QVERIFY(decoder.isValid());
+ QString uniString;
+ for (qsizetype i = 0; i < ba.size(); ++i) // qsizetype matches the type returned by size()
+ uniString += decoder(QByteArrayView(ba).sliced(i, 1));
+ QCOMPARE(uniString, utf16); // the concatenated pieces must equal the whole-string result
+ QCOMPARE(uniString, QString::fromUtf8(ba));
+ uniString.clear();
+
+ // do it again (using .decode())
+ for (qsizetype i = 0; i < ba.size(); ++i)
+ uniString += decoder.decode(QByteArrayView(ba).sliced(i, 1));
+ QCOMPARE(uniString, utf16);
+ QCOMPARE(uniString, QString::fromUtf8(ba));
+
+ QStringEncoder encoder(encoding);
+ QByteArray reencoded;
+ for (qsizetype i = 0; i < utf16.size(); ++i) // one code unit per call, so surrogate pairs are split across calls
+ reencoded += encoder(utf16.sliced(i, 1));
+ QCOMPARE(reencoded, ba);
+ reencoded.clear();
+
+ // do it again (using .encode())
+ for (qsizetype i = 0; i < utf16.size(); ++i)
+ reencoded += encoder.encode(utf16.sliced(i, 1));
+ QCOMPARE(reencoded, ba);
+}
+
+void tst_QStringConverter::convertL1U16()
+{
+ const QLatin1StringView latin1("some plain latin1 text");
+ const QString qstr(latin1);
- if (!file.open(QIODevice::ReadOnly))
- QFAIL(qPrintable("File could not be opened: " + file.errorString()));
+ QStringDecoder decoder(QStringConverter::Latin1);
+ QVERIFY(decoder.isValid());
+ QString uniString = decoder(latin1);
+ QCOMPARE(uniString, qstr);
+ QCOMPARE(latin1, uniString.toLatin1());
+
+ // do it again (using .decode())
+ uniString = decoder.decode(latin1);
+ QCOMPARE(uniString, qstr);
+ QCOMPARE(latin1, uniString.toLatin1());
+
+ QStringEncoder encoder(QStringConverter::Latin1);
+ QByteArray reencoded = encoder(uniString);
+ QCOMPARE(reencoded, QByteArrayView(latin1));
+ QCOMPARE(reencoded, uniString.toLatin1());
+
+ // do it again (using .encode())
+ reencoded = encoder.encode(uniString);
+ QCOMPARE(reencoded, QByteArrayView(latin1));
+ QCOMPARE(reencoded, uniString.toLatin1());
+}
- QByteArray ba = file.readAll();
- QVERIFY(!ba.isEmpty());
+void tst_QStringConverter::roundtrip_data() // Builds one row per (codec, test string) pair that the codec is able to represent.
+{
+ QTest::addColumn<QStringView>("utf16");
+ QTest::addColumn<QStringConverter::Encoding>("code");
+
+ for (const auto &code : codes) {
+ for (const TestString &s : testStrings) {
+ // rules:
+ // 1) don't pass the null character to the System codec
+ // 2) only operate on a string that will properly convert
+ if (code.code == QStringConverter::System && s.utf16.contains(QChar(0)))
+ continue;
+ if (code.limitation < s.limitation) // codec is more restricted than the string requires
+ continue;
+ QTest::addRow("%s:%s", code.name, s.description) << s.utf16 << code.code;
+ }
+
+ if (code.limitation == FullUnicode) { // full-Unicode codecs additionally get ten surrogate-pair rows
+ using Digits = std::array<QChar, 2>;
+ using DigitsArray = std::array<Digits, 10>;
+ static constexpr DigitsArray chakmaDigits = []() { // static storage: the rows hold QStringViews into this array
+ const char32_t zeroVal = 0x11136; // Unicode's representation of Chakma zero
+ DigitsArray r;
+ for (int i = 0; i < int(r.size()); ++i)
+ r[i] = { QChar::highSurrogate(zeroVal + i), QChar::lowSurrogate(zeroVal + i) };
+ return r;
+ }();
+ for (int i = 0; i < int(chakmaDigits.size()); ++i)
+ QTest::addRow("%s:Chakma-digit-%d", code.name, i) << QStringView(chakmaDigits[i]) << code.code;
+ }
+ }
+}
+void tst_QStringConverter::roundtrip()
+{
+ QFETCH(QStringView, utf16);
+ QFETCH(QStringConverter::Encoding, code);
+ QStringEncoder out(code);
+ QByteArray encoded = out.encode(utf16);
+ QStringDecoder back(code);
+ QString decoded = back.decode(encoded);
+ QCOMPARE(decoded, utf16);
+
+ // test some flags
+ QStringConverter::Flags flag = QStringEncoder::Flag::Stateless;
{
- QStringDecoder decoder(QStringDecoder::Utf8);
- QVERIFY(decoder.isValid());
- QString uniString = decoder(ba);
- QCOMPARE(uniString, QString::fromUtf8(ba));
- QCOMPARE(ba, uniString.toUtf8());
- uniString = decoder.decode(ba);
- QCOMPARE(uniString, QString::fromUtf8(ba));
- QCOMPARE(ba, uniString.toUtf8());
-
- QStringEncoder encoder(QStringEncoder::Utf8);
- QCOMPARE(ba, encoder(uniString));
- QCOMPARE(ba, encoder.encode(uniString));
+ QStringEncoder out2(code, flag);
+ QStringDecoder back2(code, flag);
+ decoded = back2.decode(out2.encode(utf16));
+ QCOMPARE(decoded, utf16);
+ }
+ flag |= QStringConverter::Flag::ConvertInvalidToNull;
+ {
+ QStringEncoder out2(code, flag);
+ QStringDecoder back2(code, flag);
+ decoded = back2.decode(out2.encode(utf16));
+ QCOMPARE(decoded, utf16);
}
+ if (utf16.isEmpty())
+ return;
+
+ // repeat, with a longer string
+ constexpr qsizetype MinSize = 128;
+ QString uniString = utf16.toString();
+ while (uniString.size() < MinSize && encoded.size() < MinSize) {
+ uniString += uniString;
+ encoded += encoded;
+ }
+ QCOMPARE(out.encode(uniString), encoded);
+ QCOMPARE(back.decode(encoded), uniString);
+
+ QStringEncoder out2(code, flag);
+ QStringDecoder back2(code, flag);
+ decoded = back2.decode(out2.encode(uniString));
+ QCOMPARE(decoded, uniString);
+}
+
+void tst_QStringConverter::convertL1U8()
+{
{
- // once again converting char by char
- QStringDecoder decoder(QStringDecoder::Utf8);
- QVERIFY(decoder.isValid());
- QString uniString;
- for (int i = 0; i < ba.size(); ++i)
- uniString += decoder(QByteArrayView(ba).sliced(i, 1));
- QCOMPARE(uniString, QString::fromUtf8(ba));
- uniString.clear();
- for (int i = 0; i < ba.size(); ++i)
- uniString += decoder.decode(QByteArrayView(ba).sliced(i, 1));
- QCOMPARE(uniString, QString::fromUtf8(ba));
-
- QStringEncoder encoder(QStringEncoder::Utf8);
- QByteArray reencoded;
- for (int i = 0; i < uniString.size(); ++i)
- reencoded += encoder(QStringView(uniString).sliced(i, 1));
- QCOMPARE(ba, encoder(uniString));
- reencoded.clear();
- for (int i = 0; i < uniString.size(); ++i)
- reencoded += encoder.encode(QStringView(uniString).sliced(i, 1));
- QCOMPARE(ba, encoder(uniString));
+ std::array<char, 256> latin1;
+ std::iota(latin1.data(), latin1.data() + latin1.size(), uchar(0));
+ std::array<char, 512> utf8;
+ auto out = QUtf8::convertFromLatin1(utf8.data(), QLatin1StringView{latin1.data(), latin1.size()});
+ QCOMPARE(QString::fromLatin1(latin1.data(), latin1.size()),
+ QString::fromUtf8(utf8.data(), out - utf8.data()));
}
}
-void tst_QStringConverter::nonFlaggedCodepointFFFF() const
+#if QT_CONFIG(icu)
+
+void tst_QStringConverter::roundtripIcu_data()
{
- //Check that the code point 0xFFFF (=non-character code 0xEFBFBF) is not flagged
- const QChar ch(0xFFFF);
+ QTest::addColumn<QString>("original");
+ QTest::addColumn<QByteArray>("codec");
+
+ QTest::addRow("shift_jis") << u"古池や 蛙飛び込む 水の音"_s << QByteArray("shift_jis");
+ QTest::addRow("UTF7") << u"Übermäßig: čçö"_s << QByteArray("UTF-7");
+}
+
+void tst_QStringConverter::roundtripIcu() // Encodes `original` with the named codec and decodes it back; the result must equal the input.
+{
+ QFETCH(QString, original);
+ QFETCH(QByteArray, codec);
+ QStringEncoder fromUtf16(codec);
+ if (!fromUtf16.isValid()) // the codec could not be constructed: skip the row instead of failing
+ QSKIP("Unsupported codec");
+ QStringDecoder toUtf16(codec);
+ QByteArray encoded = fromUtf16(original); // codec-neutral name: the data rows are not limited to Shift-JIS
+ QString roundTripped = toUtf16(encoded);
+ QCOMPARE(roundTripped, original);
+}
- QStringEncoder encoder(QStringEncoder::Utf8);
+void tst_QStringConverter::icuEncodeEdgeCases_data() // Rows of (source text, expected encoded bytes, codec name) for icuEncodeEdgeCases().
+{
+ QTest::addColumn<QString>("source");
+ QTest::addColumn<QByteArray>("expected") ;
+ QTest::addColumn<QByteArray>("codec");
+
+ QTest::addRow("empty") << QString() << QByteArray() << QByteArray("ISO-2022-CN");
+ QTest::addRow("BOMonly") << QString(QChar(QChar::ByteOrderMark)) << QByteArray() << QByteArray("ISO-2022-CN"); // a lone BOM is expected to encode to nothing
+ QTest::addRow("1to6") << u"좋"_s << QByteArray::fromHex("1b2428434141") << QByteArray("ISO-2022-JP-2"); // NOTE(review): "1toN" presumably means one UTF-16 code unit -> N bytes (expected values are 6, 7 and 8 bytes long)
+ QTest::addRow("1to7") << u"漢"_s << QByteArray::fromHex("1b2429470e6947") << QByteArray("ISO-2022-CN");
+ QTest::addRow("1to8") << u"墎"_s << QByteArray::fromHex("1b242a481b4e4949") << QByteArray("ISO-2022-CN");
+ QTest::addRow("utf7") << u"Übergröße"_s << QByteArray("+ANw-bergr+APYA3w-e") << QByteArray("UTF-7");
+}
+
+void tst_QStringConverter::icuEncodeEdgeCases()
+{
+ QFETCH(QString, source);
+ QFETCH(QByteArray, expected);
+ QFETCH(QByteArray, codec);
+ QStringEncoder encoder(codec);
+ if (!encoder.isValid())
+ QSKIP("Unsupported codec");
QVERIFY(encoder.isValid());
+ QByteArray encoded = encoder.encode(source);
+ QCOMPARE(encoded, expected);
+}
+
+void tst_QStringConverter::charByCharConsistency_data() // Pairs every non-empty test string with the "shift_jis" and "EUC-CN" codec names.
+{
+ QTest::addColumn<QStringView>("source");
+ QTest::addColumn<QByteArray>("codec");
+
+ auto addRow = [](const TestString &s) { // two rows per string, one per codec name
+ QTest::addRow("%s_shift_jis", s.description) << s.utf16 << QByteArray("shift_jis");
+ QTest::addRow("%s_EUC-CN", s.description) << s.utf16 << QByteArray("EUC-CN");
+ };
+
+ for (const TestString &s : testStrings) {
+ if (s.utf16.isEmpty()) // the empty string is left out
+ continue;
+ addRow(s);
+ }
+}
- const QByteArray asDecoded = encoder(QStringView(&ch, 1));
- QCOMPARE(asDecoded, QByteArray("\357\277\277"));
+void tst_QStringConverter::charByCharConsistency()
+{
+ QFETCH(QStringView, source);
+ QFETCH(QByteArray, codec);
- QByteArray ffff("\357\277\277");
- QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull);
- QVERIFY(decoder.isValid());
- QVERIFY(decoder(ffff) == QString(1, ch));
+ {
+ QStringEncoder encoder(codec);
+ if (!encoder.isValid())
+ QSKIP("Unsupported codec");
+
+ QByteArray fullyConverted = encoder.encode(source);
+ encoder.resetState();
+ QByteArray stepByStepConverted;
+ for (const auto& codeUnit: source) {
+ stepByStepConverted += encoder.encode(codeUnit);
+ }
+ QCOMPARE(stepByStepConverted, fullyConverted);
+ }
+
+ {
+ QStringEncoder encoder(codec, QStringConverter::Flag::ConvertInvalidToNull);
+
+ QByteArray fullyConverted = encoder.encode(source);
+ encoder.resetState();
+ QByteArray stepByStepConverted;
+ for (const auto& codeUnit: source) {
+ stepByStepConverted += encoder.encode(codeUnit);
+ }
+ QCOMPARE(stepByStepConverted, fullyConverted);
+ }
+}
+
+void tst_QStringConverter::byteByByteConsistency_data() // Rows of (encoded bytes, codec name) for byteByByteConsistency().
+{
+ QTest::addColumn<QByteArray>("source");
+ QTest::addColumn<QByteArray>("codec");
+
+ QTest::addRow("plain_ascii_utf7") << QByteArray("Hello, world!") << QByteArray("UTF-7");
+ QFile eucKr(":/euc_kr.txt"); // read from the ":/" resource path
+ if (eucKr.open(QFile::ReadOnly)) // the row is silently omitted when the file cannot be opened
+ QTest::addRow("euc_kr_storing_jp") << eucKr.readAll() << QByteArray("EUC-KR");
+ QTest::addRow("incomplete_euc_jp") << QByteArrayLiteral("test\x8Ftest") << QByteArray("EUC-JP");
+}
+
+void tst_QStringConverter::byteByByteConsistency() // Decoding one byte per call must give the same text as decoding the whole input at once.
+{
+ QFETCH(QByteArray, source);
+ QFETCH(QByteArray, codec);
+
+ { // pass 1: default flags
+ QStringDecoder decoder(codec);
+ if (!decoder.isValid())
+ QSKIP("Unsupported codec");
+
+ QString fullyConverted = decoder.decode(source);
+ decoder.resetState(); // start the piecewise run from a fresh state
+ QString stepByStepConverted;
+ for (const auto& byte: source) {
+ QByteArray singleChar;
+ singleChar.append(byte);
+ stepByStepConverted += decoder.decode(singleChar);
+ }
+ QCOMPARE(stepByStepConverted, fullyConverted);
+ }
+
+ { // pass 2: same comparison with Flag::ConvertInvalidToNull
+ QStringDecoder decoder(codec, QStringConverter::Flag::ConvertInvalidToNull);
+ if (!decoder.isValid())
+ QSKIP("Unsupported codec");
+
+ QString fullyConverted = decoder.decode(source);
+ decoder.resetState();
+ QString stepByStepConverted;
+ for (const auto& byte: source) {
+ QByteArray singleChar;
+ singleChar.append(byte);
+ stepByStepConverted += decoder.decode(singleChar);
+ }
+ QCOMPARE(stepByStepConverted, fullyConverted);
+ }
+}
+
+void tst_QStringConverter::statefulPieceWise() // Feeds an "HZ" decoder in pieces and checks that resetState() discards the shift state.
+{
+ QStringDecoder decoder("HZ");
+ if (!decoder.isValid())
+ QSKIP("Unsupported codec");
+ QString start = decoder.decode("pure ASCII");
+ QCOMPARE(start, u"pure ASCII");
+ QString shifted = decoder.decode("~{");
+ // shift out changes the state, but won't create any output
+ QCOMPARE(shifted, "");
+ QString continuation = decoder.decode("\x42\x43"); // in the shifted state these two bytes decode to a single character
+ QCOMPARE(continuation, "旅");
+ decoder.resetState();
+ // after resetting the state we're in N0 again
+ QString afterReset = decoder.decode("\x42\x43"); // the same two bytes now decode as plain "BC"
+ QCOMPARE(afterReset, "BC");
+}
+
+void tst_QStringConverter::icuUsableAfterMove() // A move-constructed converter must keep working, including mid-sequence decoder state.
+{
+ {
+ QStringDecoder decoder("EUC-JP");
+ QVERIFY(decoder.isValid());
+ QString partial = decoder.decode("Test\x8E"); // the trailing 0x8E yields no output yet (checked on the next line)
+ QCOMPARE(partial, u"Test"_s);
+ QStringDecoder moved(std::move(decoder));
+ QString complete = partial + moved.decode("\xA1Test"); // the moved-to decoder completes the pending sequence as U+FF61
+ QCOMPARE(complete, u"Test\uFF61Test"_s);
+ }
+ {
+ QStringEncoder encoder("Big5");
+ QVERIFY(encoder.isValid());
+ QByteArray encoded = encoder.encode("hello"_L1);
+ QCOMPARE(encoded, "hello");
+ QStringEncoder moved(std::move(encoder));
+ encoded = moved.encode("bye"); // the moved-to encoder must still encode
+ QCOMPARE(encoded, "bye");
+ }
+}
+
+void tst_QStringConverter::icuInvalidCharacter_data()
+{
+ QTest::addColumn<QString>("string");
+ QTest::addColumn<QByteArray>("bytearray");
+ QTest::addColumn<QByteArray>("codec");
+ QTest::addColumn<QStringConverter::Flags>("flags");
+ QTest::addColumn<bool>("shouldDecode");
+
+ using Flags = QStringConverter::Flags;
+ using Flag = QStringConverter::Flag;
+ QTest::addRow("encode")
+ << u"Test👪Test"_s
+ << QByteArrayLiteral("\xE3\x85\xA2\xA3\x3F\xE3\x85\xA2\xA3")
+ << QByteArray("IBM-037") << Flags(Flag::Default)
+ << false;
+ QTest::addRow("encode_null")
+ << u"Test👪Test"_s
+ << QByteArrayLiteral("\xE3\x85\xA2\xA3\0\xE3\x85\xA2\xA3")
+ << QByteArray("IBM-037") << Flags(Flag::ConvertInvalidToNull)
+ << false;
+ QTest::addRow("decode_incomplete_EUC-JP")
+ << u"test"_s
+ << QByteArrayLiteral("test\x8F")
+ << QByteArray("EUC-JP") << Flags(Flag::Stateless)
+ << true;
+ QTest::addRow("decode_invalid_EUC-JP_sequence")
+ << u"test\0test"_s
+ << QByteArrayLiteral("test\x8Ftest")
+ << QByteArray("EUC-JP") << Flags(Flag::ConvertInvalidToNull)
+ << true;
+ QTest::addRow("encode_incomplete_surrogate")
+ << u"test"_s + QChar::highSurrogate(0x11136)
+ << QByteArray("test")
+ << QByteArray("EUC-JP") << Flags(Flag::Stateless)
+ << false;
}
+void tst_QStringConverter::icuInvalidCharacter() // Every row must both set hasError() and produce exactly the expected output.
+{
+ QFETCH(QString, string);
+ QFETCH(QByteArray, bytearray);
+ QFETCH(QByteArray, codec);
+ QFETCH(QStringConverter::Flags, flags);
+ QFETCH(bool, shouldDecode);
+ if (shouldDecode) { // decode direction: bytearray is the input, string the expected result
+ QStringDecoder decoder(codec.data(), flags);
+ QVERIFY(decoder.isValid());
+ QString decoded = decoder.decode(bytearray);
+ QVERIFY(decoder.hasError());
+ QCOMPARE(decoded, string);
+ } else { // encode direction: string is the input, bytearray the expected result
+ QStringEncoder encoder(codec.data(), flags);
+ QVERIFY(encoder.isValid());
+ QByteArray encoded = encoder.encode(string);
+ QVERIFY(encoder.hasError());
+ QCOMPARE(encoded, bytearray);
+ }
+}
+
+#endif
+
void tst_QStringConverter::flagF7808080() const
{
/* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml,
@@ -195,45 +782,6 @@ void tst_QStringConverter::flagF7808080() const
QCOMPARE(decoder(input), QString(input.size(), QChar(0)));
}
-void tst_QStringConverter::nonFlaggedEFBFBF() const
-{
- /* Check that the codec does NOT flag EFBFBF.
- * This is a regression test; see QTBUG-33229
- */
- QByteArray validInput;
- validInput.resize(3);
- validInput[0] = char(0xEF);
- validInput[1] = char(0xBF);
- validInput[2] = char(0xBF);
-
- {
- QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull);
- QVERIFY(decoder.isValid());
- QVERIFY(decoder(validInput) == QString::fromUtf8(QByteArray::fromHex("EFBFBF")));
- }
-
- // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character
- {
- QByteArray start("B");
- start.append(validInput);
-
- QStringDecoder decoder(QStringEncoder::Utf8, QStringDecoder::Flag::ConvertInvalidToNull);
- QVERIFY(decoder.isValid());
- QVERIFY(decoder(start) == QString::fromUtf8(QByteArray("B").append(QByteArray::fromHex("EFBFBF"))));
- }
-}
-
-void tst_QStringConverter::decode0D() const
-{
- QByteArray input;
- input.resize(3);
- input[0] = 'A';
- input[1] = '\r';
- input[2] = 'B';
-
- QCOMPARE(QString::fromUtf8(input.constData()).toUtf8(), input);
-}
-
static QString fromInvalidUtf8Sequence(const QByteArray &ba)
{
return QString().fill(QChar::ReplacementCharacter, ba.size());
@@ -1326,6 +1874,51 @@ void tst_QStringConverter::utf8bom()
QCOMPARE(decoder(data), result);
}
+// someone set us up the BOM!
+void tst_QStringConverter::roundtripBom_data() // Rows of (text, encoding) for the codecs that have an entry in encodedBoms.
+{
+ QTest::addColumn<QStringView>("utf16");
+ QTest::addColumn<QStringConverter::Encoding>("code");
+
+ for (const auto &code : codes) {
+ if (size_t(code.code) >= encodedBoms.size()) // no BOM table entry; NOTE(review): break assumes all later entries of codes lack one too
+ break;
+ if (code.limitation != FullUnicode)
+ continue; // can't represent BOM
+
+ for (const TestString &s : testStrings) {
+ if (s.utf16.isEmpty()) // the empty string is left out
+ continue;
+ QTest::addRow("%s:%s", code.name, s.description) << s.utf16 << code.code;
+ }
+ }
+}
+
+void tst_QStringConverter::roundtripBom() // Checks BOM writing (Flag::WriteBom) and BOM handling on decode for one encoding.
+{
+ QFETCH(QStringView, utf16);
+ QFETCH(QStringConverter::Encoding, code);
+ QByteArray encodedBom = encodedBoms[code].toByteArray(); // roundtripBom_data() only adds encodings that are inside this table
+ QChar bom = QChar::ByteOrderMark;
+
+ // QStringConverter defaults to producing no BOM, but interpreting it if it
+ // is there
+
+ QStringEncoder encoderWithoutBom(code);
+ QStringEncoder encoderWithBom(code, QStringEncoder::Flag::WriteBom);
+ QByteArray encodedWithoutBom = encoderWithoutBom(utf16);
+ QByteArray encodedWithBom = encoderWithBom(utf16);
+ QCOMPARE(encodedWithBom, encodedBom + encodedWithoutBom); // WriteBom only prepends the BOM bytes
+
+ QStringDecoder decoderWithoutBom(code, QStringDecoder::Flag::ConvertInitialBom);
+ QStringDecoder decoderWithBom(code);
+ QString decoded = decoderWithBom(encodedWithBom); // default flags: the leading BOM does not appear in the output
+ QCOMPARE(decoded, utf16);
+
+ decoded = decoderWithoutBom(encodedWithBom); // ConvertInitialBom: the BOM is kept as the first output character
+ QCOMPARE(decoded, bom + utf16.toString());
+}
+
void tst_QStringConverter::utf8stateful_data()
{
QTest::addColumn<QByteArray>("buffer1");
@@ -1596,7 +2189,7 @@ void tst_QStringConverter::utfHeaders()
QVERIFY(decode.isValid());
QString result = decode(encoded);
- QCOMPARE(result.length(), unicode.length());
+ QCOMPARE(result.size(), unicode.size());
QCOMPARE(result, unicode);
}
@@ -1607,7 +2200,7 @@ void tst_QStringConverter::utfHeaders()
QString result;
for (char c : encoded)
result += decode(QByteArrayView(&c, 1));
- QCOMPARE(result.length(), unicode.length());
+ QCOMPARE(result.size(), unicode.size());
QCOMPARE(result, unicode);
}
@@ -1635,25 +2228,42 @@ void tst_QStringConverter::encodingForName_data()
QTest::addColumn<QByteArray>("name");
QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding");
- QTest::newRow("UTF-8") << QByteArray("UTF-8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
- QTest::newRow("utf8") << QByteArray("utf8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
- QTest::newRow("Utf-8") << QByteArray("Utf-8") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
- QTest::newRow("UTF-16") << QByteArray("UTF-16") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16);
- QTest::newRow("UTF-16le") << QByteArray("UTF-16le") << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE);
- QTest::newRow("ISO-8859-1") << QByteArray("ISO-8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
- QTest::newRow("ISO8859-1") << QByteArray("ISO8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
- QTest::newRow("iso8859-1") << QByteArray("iso8859-1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
- QTest::newRow("latin1") << QByteArray("latin1") << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
- QTest::newRow("latin2") << QByteArray("latin2") << std::optional<QStringConverter::Encoding>();
- QTest::newRow("latin15") << QByteArray("latin15") << std::optional<QStringConverter::Encoding>();
+ auto row = [](const char *name, std::optional<QStringConverter::Encoding> expected = std::nullopt) {
+ auto protect = [](auto p) { return p ? *p ? p : "<empty>" : "<nullptr>"; };
+ QTest::addRow("%s", protect(name)) << QByteArray(name) << expected;
+ };
+
+ row("UTF-8", QStringConverter::Utf8);
+ row("utf8", QStringConverter::Utf8);
+ row("Utf-8", QStringConverter::Utf8);
+ row("UTF-16", QStringConverter::Utf16);
+ row("UTF-16le", QStringConverter::Utf16LE);
+ row("ISO-8859-1", QStringConverter::Latin1);
+ row("ISO8859-1", QStringConverter::Latin1);
+ row("iso8859-1", QStringConverter::Latin1);
+ row("latin1", QStringConverter::Latin1);
+ row("latin-1_-", QStringConverter::Latin1);
+ row("latin_1-_", QStringConverter::Latin1);
+ row("-_latin-1", QStringConverter::Latin1);
+ row("_-latin_1", QStringConverter::Latin1);
+
+ // failures:
+ row(nullptr);
+ row("");
+ row("latin2");
+ row("latin42");
+ row(" latin1"); // spaces are significant
+ row("\tlatin1"); // HTs are significant
}
void tst_QStringConverter::encodingForName()
{
- QFETCH(QByteArray, name);
- QFETCH(std::optional<QStringConverter::Encoding>, encoding);
+ QFETCH(const QByteArray, name);
+ QFETCH(const std::optional<QStringConverter::Encoding>, encoding);
+
+ const auto *ptr = name.isNull() ? nullptr : name.data();
- auto e = QStringConverter::encodingForName(name);
+ const auto e = QStringConverter::encodingForName(ptr);
QCOMPARE(e, encoding);
}
@@ -1737,65 +2347,102 @@ void tst_QStringConverter::encodingForHtml_data()
{
QTest::addColumn<QByteArray>("html");
QTest::addColumn<std::optional<QStringConverter::Encoding>>("encoding");
+ QTest::addColumn<QByteArray>("name"); // ICU name if we have ICU support
QByteArray html = "<html><head></head><body>blah</body></html>";
- QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("no charset") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>";
- QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>();
+ QTest::newRow("latin 15") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-8859-15");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=SJIS\" /></head></html>";
+ QTest::newRow("sjis") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Shift_JIS");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022-JP\" /></head></html>";
+ QTest::newRow("ISO-2022-JP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-2022\" /></head></html>";
+ QTest::newRow("ISO-2022") << html << std::optional<QStringConverter::Encoding>() << QByteArray("ISO-2022-JP");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312\" /></head></html>";
+ QTest::newRow("GB2312") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5\" /></head></html>";
+ QTest::newRow("Big5") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB18030\" /></head></html>";
+ QTest::newRow("GB18030") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB18030");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=GB2312-HKSCS\" /></head></html>";
+ QTest::newRow("GB2312-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("GB2312-HKSCS");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=Big5-HKSCS\" /></head></html>";
+ QTest::newRow("Big5-HKSCS") << html << std::optional<QStringConverter::Encoding>() << QByteArray("Big5-HKSCS");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucJP\" /></head></html>";
+ QTest::newRow("EucJP") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-JP");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=EucKR\" /></head></html>";
+ QTest::newRow("EucKR") << html << std::optional<QStringConverter::Encoding>() << QByteArray("EUC-KR");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-R\" /></head></html>";
+ QTest::newRow("KOI8-R") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-R");
+
+ html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=KOI8-U\" /></head></html>";
+ QTest::newRow("KOI8-U") << html << std::optional<QStringConverter::Encoding>() << QByteArray("KOI8-U");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>";
- QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
+ QTest::newRow("latin 1") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1");
html = "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
- QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1);
+ QTest::newRow("latin 1 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Latin1) << QByteArray("ISO-8859-1");
html = "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>";
- QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>";
- QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("UTF-8 (#2)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>";
- QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("UTF-8, no quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>";
- QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("UTF-8, single quotes") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>";
- QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("UTF-8, > terminator") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>";
- QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
// Test invalid charsets.
html = "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>";
- QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>();
+ QTest::newRow("utf/8") << html << std::optional<QStringConverter::Encoding>() << QByteArray();
html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>";
- QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>();
+ QTest::newRow("invalid charset, no default") << html << std::optional<QStringConverter::Encoding>() << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"";
html.prepend(QByteArray().fill(' ', 512 - html.size()));
- QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("invalid charset (large header)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8";
- QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("invalid charset (no closing double quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8";
- QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("invalid charset (no closing single quote)") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>";
- QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>();
+ QTest::newRow("invalid (space terminator)") << html << std::optional<QStringConverter::Encoding>() << QByteArray();
html = "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>";
- QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>();
+ QTest::newRow("invalid charset, early terminator (')") << html << std::optional<QStringConverter::Encoding>() << QByteArray();
const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 };
html = src;
- QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE);
+ QTest::newRow("greek text UTF-16LE") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf16LE) << QByteArray("UTF-16LE");
html = "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: "
@@ -1803,19 +2450,33 @@ void tst_QStringConverter::encodingForHtml_data()
"line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: "
"auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: "
"none;\">&#x37b</span>\000";
- QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("greek text UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
html = "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">"
"<head/><body><p>bla</p></body></html>"; // QTBUG-41998, ICU will return UTF-16.
- QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8);
+ QTest::newRow("legacy unicode UTF-8") << html << std::optional<QStringConverter::Encoding>(QStringConverter::Utf8) << QByteArray("UTF-8");
}
void tst_QStringConverter::encodingForHtml()
{
QFETCH(QByteArray, html);
QFETCH(std::optional<QStringConverter::Encoding>, encoding);
+ QFETCH(QByteArray, name);
QCOMPARE(QStringConverter::encodingForHtml(html), encoding);
+ // Also build a decoder from the same HTML and, when one is expected or available, check its name
+ QStringDecoder decoder = QStringDecoder::decoderForHtml(html);
+ if (encoding || // we should have a valid decoder independent of ICU support
+ decoder.isValid()) { // we got a valid decoder through ICU
+ QCOMPARE(decoder.name(), name);
+ }
+}
+
+void tst_QStringConverter::availableCodesAreAvailable()
+{
+ auto codecs = QStringConverter::availableCodecs(); // every name listed here must produce a valid QStringEncoder
+ for (const auto &codecName: codecs)
+ QVERIFY(QStringEncoder(codecName.toLatin1()).isValid());
}
class LoadAndConvert: public QRunnable
@@ -1838,8 +2499,20 @@ public:
}
};
+void tst_QStringConverter::initTestCase()
+{
+ if (localeIsUtf8()) // informational only: note in the test output which kind of system locale is in use
+ qInfo("System locale is UTF-8");
+ else
+ qInfo("System locale is not UTF-8");
+}
+
void tst_QStringConverter::threadSafety()
{
+#if defined(Q_OS_WASM)
+ QSKIP("This test misbehaves on WASM. Investigation needed (QTBUG-110067)");
+#endif
+
QThreadPool::globalInstance()->setMaxThreadCount(12);
QList<QString> res;
@@ -1855,6 +2528,292 @@ void tst_QStringConverter::threadSafety()
QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz"));
}
+#ifdef Q_OS_WIN
+void tst_QStringConverter::fromLocal8Bit_data()
+{
+ QTest::addColumn<QByteArray>("eightBit");
+ QTest::addColumn<QString>("utf16");
+ QTest::addColumn<quint32>("codePage");
+ // Rows: the encoded octets, the same text as UTF-16, and the numeric code page used for the conversion
+ constexpr uint WINDOWS_1252 = 1252u;
+ QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252;
+ constexpr uint SHIFT_JIS = 932u;
+ // Mostly two byte characters, but the comma is a single byte character (0xa4)
+ QTest::newRow("shiftJIS")
+ << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba
+ << u"こんにちは、世界!"_s << SHIFT_JIS;
+
+ constexpr uint GB_18030 = 54936u;
+ QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界!"_s
+ << GB_18030;
+}
+
+void tst_QStringConverter::fromLocal8Bit()
+{
+ QFETCH(const QByteArray, eightBit);
+ QFETCH(const QString, utf16);
+ QFETCH(const quint32, codePage);
+
+ QStringConverter::State state;
+ // Decode the whole input in a single call
+ QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state);
+ QCOMPARE(result, utf16);
+ QCOMPARE(state.remainingChars, 0);
+ // Decode one octet per call, reusing the same state across calls
+ result.clear();
+ state.clear();
+ for (char c : eightBit)
+ result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state);
+ QCOMPARE(result, utf16);
+ QCOMPARE(state.remainingChars, 0);
+
+ result.clear();
+ state.clear();
+ // Decode the full string again, this time without state
+ state.flags |= QStringConverter::Flag::Stateless;
+ result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state);
+ QCOMPARE(result, utf16);
+ QCOMPARE(state.remainingChars, 0);
+}
+
+void tst_QStringConverter::fromLocal8Bit_special_cases()
+{
+ QStringConverter::State state; // shared by the stateful sub-tests below and reset with clear() between them
+ constexpr uint SHIFT_JIS = 932u;
+ // Decode a 2-octet character, but only provide 1 octet at first:
+ QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state);
+ QCOMPARE(result, QString());
+ QVERIFY(result.isNull());
+ QCOMPARE_GT(state.remainingChars, 0);
+ // Then provide the second octet:
+ result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state);
+ QCOMPARE(result, u"こ");
+ QCOMPARE(state.remainingChars, 0);
+
+ // And without state: with the Stateless flag set, each call is converted on its own:
+ result.clear();
+ QStringConverter::State statelessState;
+ statelessState.flags |= QStringConverter::Flag::Stateless;
+ result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &statelessState);
+ result += QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &statelessState);
+ // 0xb1 is a valid single-octet character in Shift-JIS, so the output
+ // isn't really what you would expect:
+ QCOMPARE(result, QString(QChar::ReplacementCharacter) + u'ア');
+ QCOMPARE(statelessState.remainingChars, 0);
+
+ // Now try a 3-octet UTF-8 sequence:
+ result.clear();
+ state.clear();
+ constexpr uint UTF8 = 65001u;
+ // First the first 2 octets:
+ result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state);
+ QCOMPARE(result, QString());
+ QVERIFY(result.isNull());
+ QCOMPARE_GT(state.remainingChars, 0);
+ // Then provide the remaining octet:
+ result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state);
+ QCOMPARE(result, u"你");
+ QCOMPARE(state.remainingChars, 0);
+
+ // Now the same, but there is an incomplete sequence at the start
+ result.clear();
+ state.clear();
+ result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state);
+ QCOMPARE(result, QString());
+ QVERIFY(result.isNull());
+ // Remaining octet (and a '.' to force it to discard something from the
+ // internal state which is currently limited to 4 octets):
+ result += QLocal8Bit::convertToUnicode_sys("\xa0.", UTF8, &state);
+ QCOMPARE(result, QChar::ReplacementCharacter + u"你."_s);
+ QCOMPARE(state.remainingChars, 0);
+
+ // Test QTBUG-118834, which is still failing (hence the QEXPECT_FAILs below)
+ result.clear();
+ state.clear();
+ result = QLocal8Bit::convertToUnicode_sys("\xe4\xe4\xbd", UTF8, &state);
+ QCOMPARE(result, QString());
+ QVERIFY(result.isNull());
+ // Remaining octet:
+ result += QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state);
+ QEXPECT_FAIL("", "QTBUG-118834: We don't output anything because it's "
+ "within the size of our internal state, and we cannot "
+ "signal that it needs to be drained.", Continue);
+ QCOMPARE(result, QChar::ReplacementCharacter + u"你"_s);
+ QEXPECT_FAIL("", "QTBUG-118834: As above", Continue);
+ QCOMPARE(state.remainingChars, 0);
+
+ // Now try a 4-octet GB 18030 sequence:
+ result.clear();
+ state.clear();
+ constexpr uint GB_18030 = 54936u;
+ const char sequence[] = "\x95\x32\x90\x31";
+ // Repeat the sequence multiple times to test handling of exhaustion of
+ // internal buffer
+ QByteArray repeated = QByteArray(sequence).repeated(2049);
+ QByteArrayView octets = QByteArrayView(repeated);
+ result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state);
+ QCOMPARE(result, QString());
+ QVERIFY(result.isNull());
+ QCOMPARE_GT(state.remainingChars, 0);
+ // Then provide one more octet:
+ result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state);
+ QCOMPARE(result, QString());
+ QVERIFY(result.isNull());
+ QCOMPARE_GT(state.remainingChars, 0);
+ // Then provide the last octet + the rest of the string
+ result = QLocal8Bit::convertToUnicode_sys(octets.sliced(3), GB_18030, &state);
+ QCOMPARE(result.first(2), u"𠂇");
+ QCOMPARE(state.remainingChars, 0);
+}
+
+void tst_QStringConverter::fromLocal8Bit_2GiB()
+{
+#if QT_POINTER_SIZE == 4
+ QSKIP("This test is only relevant for 64-bit builds");
+#else
+ qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3; // INT_MAX - 1 fill octets + one 2-octet character + 2 trailing octets
+ QByteArray input;
+ QT_TRY {
+ input.reserve(size);
+ } QT_CATCH (const std::bad_alloc &) {
+ QSKIP("Out of memory");
+ }
+ // fill with '、' - a single octet character in Shift-JIS
+ input.fill('\xa4', std::numeric_limits<int>::max() - 1);
+ // then append 'こ' - a two octet character in Shift-JIS
+ // which is now straddling the 2 GiB boundary
+ input += "\x82\xb1";
+ // then append another two '、', so that our output is also crossing the
+ // 2 GiB boundary
+ input += "\xa4\xa4";
+ QCOMPARE(input.size(), input.capacity()); // the assembled input must exactly fill the reserved capacity
+ constexpr uint SHIFT_JIS = 932u;
+ QStringConverter::State state;
+ QString result;
+ QT_TRY {
+ result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state);
+ } QT_CATCH (const std::bad_alloc &) {
+ QSKIP("Out of memory");
+ }
+ QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16
+ QCOMPARE(result.last(4), u"、こ、、"); // Check we correctly decoded it
+ QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state
+#endif
+}
+
+void tst_QStringConverter::toLocal8Bit_data()
+{
+ fromLocal8Bit_data(); // same columns and rows; toLocal8Bit() converts them in the opposite direction
+}
+
+void tst_QStringConverter::toLocal8Bit()
+{
+ QFETCH(const QByteArray, eightBit);
+ QFETCH(const QString, utf16);
+ QFETCH(const quint32, codePage);
+
+ QStringConverter::State state;
+ // Encode the whole string in a single call
+ QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state);
+ QCOMPARE(result, eightBit);
+ QCOMPARE(state.remainingChars, 0);
+ // Encode one UTF-16 code unit per call, reusing the same state across calls
+ result.clear();
+ state.clear();
+ for (QChar c : utf16)
+ result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state);
+ QCOMPARE(result, eightBit);
+ QCOMPARE(state.remainingChars, 0);
+
+ result.clear();
+ state.clear();
+ // Encode the full string again, this time without state
+ state.flags |= QStringConverter::Flag::Stateless;
+ result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state);
+ QCOMPARE(result, eightBit);
+ QCOMPARE(state.remainingChars, 0);
+}
+
+void tst_QStringConverter::toLocal8Bit_special_cases()
+{
+ QStringConverter::State state; // shared by the stateful sub-tests below and reset with clear() in between
+ // Normally utf8 goes through a different code path, but we can force it here
+ constexpr uint UTF8 = 65001u;
+ // Encode a 2-code unit character, but only provide 1 code unit at first:
+ const char16_t a[] = u"𬽦";
+ QStringView codeUnits = a;
+ QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state);
+ QCOMPARE(result, QByteArray()); // result is a QByteArray: compare against the same type
+ QVERIFY(result.isNull());
+ QCOMPARE_GT(state.remainingChars, 0);
+ // Then provide the second code unit:
+ result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state);
+ QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba);
+ QCOMPARE(state.remainingChars, 0);
+
+ // Retain compat with the behavior for toLocal8Bit:
+ QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?");
+
+ // QString::toLocal8Bit is already stateless, but test stateless handling
+ // explicitly anyway:
+ result.clear();
+ QStringConverter::State statelessState;
+ statelessState.flags |= QStringConverter::Flag::Stateless;
+ result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &statelessState);
+ result += QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &statelessState);
+ // Windows uses the replacement character for invalid characters:
+ QCOMPARE(result, "\ufffd\ufffd");
+
+ // Now do the same, but the second time we feed in a character, we also
+ // provide many more so the internal stack buffer is not large enough.
+ result.clear();
+ state.clear();
+ QString str = QStringView(a).toString().repeated(2048);
+ codeUnits = str;
+ result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state);
+ QCOMPARE(result, QByteArray()); // as above: compare QByteArray with QByteArray
+ QVERIFY(result.isNull());
+ QCOMPARE_GT(state.remainingChars, 0);
+ // Then we provide the rest of the string:
+ result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state);
+ QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba);
+ QCOMPARE(state.remainingChars, 0);
+}
+
+void tst_QStringConverter::toLocal8Bit_2GiB()
+{
+#if QT_POINTER_SIZE == 4
+ QSKIP("This test is only relevant for 64-bit builds");
+#else
+ constexpr qsizetype TwoGiB = qsizetype(std::numeric_limits<int>::max());
+ QString input;
+ QT_TRY {
+ input.reserve(TwoGiB + 1); // TwoGiB - 1 fill characters plus the 2 code units appended below
+ } QT_CATCH (const std::bad_alloc &) {
+ QSKIP("Out of memory");
+ }
+ // Fill with a single code unit character
+ input.fill(u'.', TwoGiB - 1);
+ // Then append a 2 code unit character, so that the input straddles the 2 GiB
+ // boundary
+ input += u"🙂";
+ QCOMPARE(input.size(), input.capacity());
+ constexpr uint UTF8 = 65001u;
+ QStringConverter::State state;
+ QByteArray result;
+ QT_TRY {
+ result = QLocal8Bit::convertFromUnicode_sys(input, UTF8, &state);
+ } QT_CATCH (const std::bad_alloc &) {
+ QSKIP("Out of memory");
+ }
+ QUtf8StringView rView = result;
+ QCOMPARE(rView.size(), TwoGiB + 3); // The 2 code unit smiley is 4 code units in UTF-8
+ QCOMPARE(rView.last(7), u8"...🙂"); // Check we correctly encoded it
+ QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state
+#endif
+}
+#endif // Q_OS_WIN
+
struct DontCrashAtExit {
~DontCrashAtExit() {
QStringDecoder decoder(QStringDecoder::Utf8);