Long live the ICU-based QStringConverter interface!

This adds support for additional codecs to QStringConverter when ICU is available. We store the converter in the state (d[0]), and its canonical name in d[1]. We need the name there, as in the clear function we close the UConverter, and set the pointer to null. Consequently, the actual conversion functions might need to re-open the converter. The advantage of this approach is that clear is used in the destructor of State, and with this approach we properly clean up the state. There is however a disadvantage: The clear function was so far also used for resetting the state when QStringConverter::resetState is called. Discarding the whole UConverter for that is however rather costly. For that reason we modify resetState to call a new function, State::reset. For existing converters, it behaves the same as clear; for the ICU based converter, we call the more efficient ucnv_reset. Code compiled against Qt 6.4 can benefit from this more efficient version; code compiled against older Qt versions will continue to work, as the conversion functions can just recreate the converter from the name. We can distinguish between ICU and non-ICU converters by checking if the UsesIcu flag is set. QStringConverter::name is changed to return the name stored in d[1]. The interface of the ICU converter has a dummy name, so code using the old name function from Qt < 6.4 still returns something, namely a message asking the user to recompile. The function is moved out of line, as we need to check for the private ICU feature, and want to avoid having that check in the public header. As the QStringConverter ctor taking a name now can allocate memory, it can no longer be noexcept. Removing the noexceptness is safe, as it was only added after Qt 6.3. Note that we cannot extend the API consuming or returning Encoding, as we use Encoding values to index into an array of converter interfaces in inline API. Further API to support getting an ICU converter for HTML will be added in a future commit.
Currently, the code depending on ICU is enabled at compile time if ICU is found. However, in the future it could be moved into a plugin to avoid a hard dependency on ICU in Core. [ChangeLog][Corelib][Text] QStringConverter and API using it now support more text codecs if Qt is compiled with ICU support. Fixes: QTBUG-103375 Change-Id: I7afb92fc68ef994179ebc7a3aa73beebb1386204 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Fabian Kosmale <fabian.kosmale@qt.io> 2022-01-31 11:25:25 +0100
committer: Fabian Kosmale <fabian.kosmale@qt.io> 2022-06-19 00:41:12 +0200
commit: 122270d6bea164e6df4357f4d4d77aacfa430470 (patch)
tree: 4d0477aa23a0575b7d6185311ca51a56746ab0f4
parent: d350373133f169b44fd98faab6fe3f75abab6282 (diff)
5 files changed, 568 insertions, 11 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index d252d7b667..ba13194310 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -11,6 +11,13 @@
 #include "private/qtools_p.h"
 #include "qbytearraymatcher.h"
 
+#if QT_CONFIG(icu)
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/ucnv_err.h>
+#include <unicode/ustring.h>
+#endif
+
 #ifdef Q_OS_WIN
 #include <qt_windows.h>
 #ifndef QT_BOOTSTRAPPED
@@ -1373,6 +1380,21 @@ void QStringConverter::State::clear() noexcept
     internalState = 0;
 }
 
+void QStringConverter::State::reset() noexcept
+{
+    if (flags & Flag::UsesIcu) {
+#if QT_CONFIG(icu)
+        UConverter *converter = static_cast<UConverter *>(d[0]);
+        if (converter)
+            ucnv_reset(converter);
+#else
+        Q_UNREACHABLE();
+#endif
+    } else {
+        clear();
+    }
+}
+
 static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
 {
     return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
@@ -1594,6 +1616,7 @@ static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
     \value Stateless Ignore possible converter states between different function calls
            to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
            sequence of data is encountered.
+    \omitvalue UsesIcu
 */
 
 /*!
@@ -1665,15 +1688,263 @@ static bool nameMatch(const char *a, const char *b)
     \internal
 */
 
+
+#if QT_CONFIG(icu)
+// only derives from QStringConverter to get access to protected types
+struct QStringConverterICU : QStringConverter
+{
+    static void clear_function(QStringConverterBase::State *state) noexcept
+    {
+        ucnv_close(static_cast<UConverter *>(state->d[0]));
+        state->d[0] = nullptr;
+    }
+
+    static void ensureConverter(QStringConverter::State *state)
+    {
+        // old code might reset the state via clear instead of reset
+        // in that case, the converter has been closed, and we have to reopen it
+        if (state->d[0] == nullptr)
+            state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
+    }
+
+    static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
+    {
+        ensureConverter(state);
+
+        auto icu_conv = static_cast<UConverter *>(state->d[0]);
+        UErrorCode err = U_ZERO_ERROR;
+        auto source = in.data();
+        auto sourceLimit = in.data() + in.size();
+
+        qsizetype length = toLen(in.size());
+
+        UChar *target = reinterpret_cast<UChar *>(out);
+        auto targetLimit = target + length;
+        // We explicitly clean up anyway, so no need to set flush to true,
+        // which would just reset the converter.
+        UBool flush = false;
+
+        // After a move the callback context is stale: re-register the live State itself (never &state, a dying local).
+        UConverterToUCallback action;
+        const void *context;
+        ucnv_getToUCallBack(icu_conv, &action, &context);
+        if (context != state)
+            ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
+
+        ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
+        // We did reserve enough space:
+        Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
+        if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
+            if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
+                ucnv_reset(icu_conv);
+                state->invalidChars += leftOver;
+            }
+        }
+        return reinterpret_cast<QChar *>(target);
+    }
+
+    static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
+    {
+        ensureConverter(state);
+        auto icu_conv = static_cast<UConverter *>(state->d[0]);
+        UErrorCode err = U_ZERO_ERROR;
+        auto source = reinterpret_cast<const UChar *>(in.data());
+        auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
+
+        qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.length(), ucnv_getMaxCharSize(icu_conv));
+
+        char *target = out;
+        char *targetLimit = out + length;
+        UBool flush = false;
+
+        // After a move the callback context is stale: re-register the live State itself (never &state, a dying local).
+        UConverterFromUCallback action;
+        const void *context;
+        ucnv_getFromUCallBack(icu_conv, &action, &context);
+        if (context != state)
+            ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
+
+        ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
+        // We did reserve enough space:
+        Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
+        if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
+            if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
+                ucnv_reset(icu_conv);
+                state->invalidChars += leftOver;
+            }
+        }
+        return target;
+    }
+
+    Q_DISABLE_COPY_MOVE(QStringConverterICU)
+
+    template<qsizetype X>
+    static qsizetype fromLen(qsizetype inLength)
+    {
+        return X * inLength * sizeof(UChar);
+    }
+
+    static qsizetype toLen(qsizetype inLength)
+    {
+
+        /* Assumption: each input char might map to a different codepoint
+           Each codepoint can take up to 4 bytes == 2 QChar
+           We can ignore reserving space for a BOM, as only UTF encodings use one
+           and those are not handled by the ICU converter.
+         */
+        return 2 * inLength;
+    }
+
+    static constexpr QStringConverter::Interface forLength[] = {
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
+        {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
+    };
+
+    static UConverter *createConverterForName(const char *name, const State *state)
+    {
+        Q_ASSERT(name);
+        Q_ASSERT(state);
+        UErrorCode status = U_ZERO_ERROR;
+        UConverter *conv = ucnv_open(name, &status);
+        if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
+            ucnv_close(conv);
+            return nullptr;
+        }
+
+        if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
+            UErrorCode error = U_ZERO_ERROR;
+
+            auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
+                                        const char *, int32_t length,
+                                        UConverterCallbackReason reason, UErrorCode *err) {
+                if (reason <= UCNV_IRREGULAR) {
+                    *err = U_ZERO_ERROR;
+                    UChar c = '\0';
+                    ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
+                    // Recover outer scope's state (which isn't const) from context:
+                    auto state = const_cast<State *>(static_cast<const State *>(context));
+                    state->invalidChars += length;
+                }
+            };
+            ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
+
+            auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
+                                          const UChar *, int32_t length,
+                                          UChar32, UConverterCallbackReason reason, UErrorCode *err) {
+                if (reason <= UCNV_IRREGULAR) {
+                    *err = U_ZERO_ERROR;
+                    const UChar replacement[] = { 0 };
+                    const UChar *stringBegin = std::begin(replacement);
+                    ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
+                    // Recover outer scope's state (which isn't const) from context:
+                    auto state = const_cast<State *>(static_cast<const State *>(context));
+                    state->invalidChars += length;
+                }
+            };
+            ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
+        } else {
+            UErrorCode error = U_ZERO_ERROR;
+
+            auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
+                                         const char *codeUnits,int32_t length,
+                                         UConverterCallbackReason reason, UErrorCode *err) {
+                if (reason <= UCNV_IRREGULAR) {
+                    // Recover outer scope's state (which isn't const) from context:
+                    auto state = const_cast<State *>(static_cast<const State *>(context));
+                    state->invalidChars += length;
+                }
+                // use existing ICU callback for logic
+                UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
+
+            };
+            ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
+
+            auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
+                                           const UChar *codeUnits, int32_t length,
+                                           UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
+                if (reason <= UCNV_IRREGULAR) {
+                    // Recover outer scope's state (which isn't const) from context:
+                    auto state = const_cast<State *>(static_cast<const State *>(context));
+                    state->invalidChars += length;
+                }
+                // use existing ICU callback for logic
+                UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
+                                                codePoint, reason, err);
+            };
+            ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
+        }
+        return conv;
+    }
+
+    static const QStringConverter::Interface *make_icu_converter(
+            QStringConverterBase::State *state,
+            const char *name)
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UConverter *conv = createConverterForName(name, state);
+        if (!conv)
+            return nullptr;
+
+        const char *icuName = ucnv_getName(conv, &status);
+        // ucnv_getStandardName returns a name which is owned by the library;
+        // we can thus store it in the state without worrying about its lifetime
+        const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
+        if (U_FAILURE(status) || !persistentName) {
+             status = U_ZERO_ERROR;
+             persistentName = ucnv_getStandardName(icuName, "IANA", &status);
+        }
+        state->d[1] = const_cast<char *>(persistentName);
+        state->d[0] = conv;
+        state->flags |= QStringConverterBase::Flag::UsesIcu;
+        qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
+        state->clearFn = QStringConverterICU::clear_function;
+        if (maxCharSize > 8 || maxCharSize < 1) {
+            qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
+            return nullptr;
+        } else {
+            return &forLength[maxCharSize - 1];
+        }
+
+    }
+
+};
+#endif
+
 /*!
     \internal
 */
-QStringConverter::QStringConverter(const char *name, Flags f) noexcept
+QStringConverter::QStringConverter(const char *name, Flags f)
     : iface(nullptr), state(f)
 {
     auto e = encodingForName(name);
     if (e)
         iface = encodingInterfaces + int(e.value());
+#if QT_CONFIG(icu)
+    else
+        iface = QStringConverterICU::make_icu_converter(&state, name);
+#endif
+}
+
+
+const char *QStringConverter::name() const noexcept
+{
+    if (!iface)
+        return nullptr;
+    if (state.flags & QStringConverter::Flag::UsesIcu) {
+#if QT_CONFIG(icu)
+        return static_cast<const char*>(state.d[1]);
+#else
+        return nullptr;
+#endif
+    } else {
+        return iface->name;
+    }
 }
 
 /*!
@@ -1711,8 +1982,12 @@ QStringConverter::QStringConverter(const char *name, Flags f) noexcept
 */
 
 /*!
-    Returns an optional encoding for \a name. The optional is empty if the name could
-    not get converted to a valid encoding.
+    Convert \a name to the corresponding \l Encoding member, if there is one.
+
+    If the \a name is not the name of a codec listed in the Encoding enumeration,
+    \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
+    the QStringConverter constructor when Qt is built with ICU, if ICU provides a
+    converter with the given name.
 */
 std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
 {
diff --git a/src/corelib/text/qstringconverter_base.h b/src/corelib/text/qstringconverter_base.h
index 313c88f946..68900da8f0 100644
--- a/src/corelib/text/qstringconverter_base.h
+++ b/src/corelib/text/qstringconverter_base.h
@@ -31,7 +31,8 @@ public:
         Stateless = 0x1,
         ConvertInvalidToNull = 0x2,
         WriteBom = 0x4,
-        ConvertInitialBom = 0x8
+        ConvertInitialBom = 0x8,
+        UsesIcu = 0x10,
     };
     Q_DECLARE_FLAGS(Flags, Flag)
 
@@ -39,6 +40,7 @@ public:
         constexpr State(Flags f = Flag::Default) noexcept
             : flags(f), state_data{0, 0, 0, 0} {}
         ~State() { clear(); }
+
         State(State &&other) noexcept
             : flags(other.flags),
               remainingChars(other.remainingChars),
@@ -59,6 +61,7 @@ public:
             return *this;
         }
         Q_CORE_EXPORT void clear() noexcept;
+        Q_CORE_EXPORT void reset() noexcept;
 
         Flags flags;
         int internalState = 0;
@@ -102,7 +105,8 @@ public:
         Stateless = 0x1,
         ConvertInvalidToNull = 0x2,
         WriteBom = 0x4,
-        ConvertInitialBom = 0x8
+        ConvertInitialBom = 0x8,
+        UsesIcu = 0x10,
     };
     Q_DECLARE_FLAGS(Flags, Flag)
 #endif
@@ -130,7 +134,8 @@ protected:
     constexpr explicit QStringConverter(const Interface *i) noexcept
         : iface(i)
     {}
-    Q_CORE_EXPORT explicit QStringConverter(const char *name, Flags f) noexcept;
+    Q_CORE_EXPORT explicit QStringConverter(const char *name, Flags f);
+
 
     ~QStringConverter() = default;
 
@@ -142,12 +147,11 @@ public:
 
     void resetState() noexcept
     {
-        state.clear();
+        state.reset();
     }
     bool hasError() const noexcept { return state.invalidChars != 0; }
 
-    const char *name() const noexcept
-    { return isValid() ? iface->name : nullptr; }
+    Q_CORE_EXPORT const char *name() const noexcept;
 
     Q_CORE_EXPORT static std::optional<Encoding> encodingForName(const char *name) noexcept;
     Q_CORE_EXPORT static const char *nameForEncoding(Encoding e);
diff --git a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt
index 07e33e26ca..a7816de1bf 100644
--- a/tests/auto/corelib/text/qstringconverter/CMakeLists.txt
+++ b/tests/auto/corelib/text/qstringconverter/CMakeLists.txt
@@ -8,4 +8,14 @@ qt_internal_add_test(tst_qstringconverter
     SOURCES
         tst_qstringconverter.cpp
     TESTDATA ${test_data}
+    PUBLIC_LIBRARIES
+        Qt::CorePrivate # for access to Qt's feature system
+)
+
+
+qt_internal_add_resource(tst_qstringconverter  "compressedtexture_bc1"
+    PREFIX
+        "/"
+    FILES
+        "euc_kr.txt"
 )
diff --git a/tests/auto/corelib/text/qstringconverter/euc_kr.txt b/tests/auto/corelib/text/qstringconverter/euc_kr.txt
new file mode 100644
index 0000000000..a0eb9af691
--- /dev/null
+++ b/tests/auto/corelib/text/qstringconverter/euc_kr.txt
@@ -0,0 +1 @@
+�Ы���?�Ȫ�ʦ?����?�ɪǪ���EUC Packed Format�ȡ�2�Ы���ͳ������EUC Fixed Width Format�����롣���������������ġ�������?ݻ?���Īǡ�������EUC�Ȫ�������������򦪹�������Ǫ������˪Ī�����?���롣
diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
index 633346a639..07f29a6429 100644
--- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
+++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
@@ -4,11 +4,14 @@
 
 #include <QTest>
 
+#include <QtCore/private/qglobal_p.h>
 #include <qstringconverter.h>
 #include <qthreadpool.h>
 
 #include <array>
 
+using namespace Qt::StringLiterals;
+
 static constexpr bool IsBigEndian = QSysInfo::ByteOrder == QSysInfo::BigEndian;
 enum CodecLimitation {
     AsciiOnly,
@@ -29,8 +32,6 @@ static constexpr bool localeIsUtf8()
 }
 #endif
 
-using namespace Qt::StringLiterals;
-
 struct Codec
 {
     const char name[12];
@@ -129,6 +130,21 @@ private slots:
     void roundtrip_data();
     void roundtrip();
 
+#if QT_CONFIG(icu)
+    void roundtripIcu_data();
+    void roundtripIcu();
+    void icuInvalidCharacter_data();
+    void icuInvalidCharacter();
+    void icuEncodeEdgeCases_data();
+    void icuEncodeEdgeCases();
+    void icuUsableAfterMove();
+    void charByCharConsistency_data();
+    void charByCharConsistency();
+    void byteByByteConsistency_data();
+    void byteByByteConsistency();
+    void statefulPieceWise();
+#endif
+
     void flagF7808080() const;
 
     void utf8Codec_data();
@@ -411,6 +427,257 @@ void tst_QStringConverter::roundtrip()
     QCOMPARE(decoded, uniString);
 }
 
+#if QT_CONFIG(icu)
+
+void tst_QStringConverter::roundtripIcu_data()
+{
+    QTest::addColumn<QString>("original");
+    QTest::addColumn<QByteArray>("codec");
+
+    QTest::addRow("shift_jis") << u"古池や　蛙飛び込む　水の音"_s << QByteArray("shift_jis");
+    QTest::addRow("UTF7") << u"Übermäßig: čçö"_s << QByteArray("UTF-7");
+}
+
+void tst_QStringConverter::roundtripIcu()
+{
+    QFETCH(QString, original);
+    QFETCH(QByteArray, codec);
+    QStringEncoder fromUtf16(codec);
+    if (!fromUtf16.isValid())
+        QSKIP("Unsupported codec");
+    QStringDecoder toUtf16(codec);
+    QByteArray asShiftJIS = fromUtf16(original);
+    QString roundTripped = toUtf16(asShiftJIS);
+    QCOMPARE(roundTripped, original);
+}
+
+void tst_QStringConverter::icuEncodeEdgeCases_data()
+{
+    QTest::addColumn<QString>("source");
+    QTest::addColumn<QByteArray>("expected") ;
+    QTest::addColumn<QByteArray>("codec");
+
+    QTest::addRow("empty") << QString() << QByteArray() << QByteArray("ISO-2022-CN");
+    QTest::addRow("BOMonly") << QString(QChar(QChar::ByteOrderMark)) << QByteArray() << QByteArray("ISO-2022-CN");
+    QTest::addRow("1to6") << u"좋"_s << QByteArray::fromHex("1b2428434141") << QByteArray("ISO-2022-JP-2");
+    QTest::addRow("1to7") << u"漢"_s << QByteArray::fromHex("1b2429470e6947") << QByteArray("ISO-2022-CN");
+    QTest::addRow("1to8") << u"墎"_s << QByteArray::fromHex("1b242a481b4e4949")  << QByteArray("ISO-2022-CN");
+    QTest::addRow("utf7") << u"Übergröße"_s << QByteArray("+ANw-bergr+APYA3w-e") << QByteArray("UTF-7");
+}
+
+void tst_QStringConverter::icuEncodeEdgeCases()
+{
+    QFETCH(QString, source);
+    QFETCH(QByteArray, expected);
+    QFETCH(QByteArray, codec);
+    QStringEncoder encoder(codec);
+    if (!encoder.isValid())
+        QSKIP("Unsupported codec");
+    QVERIFY(encoder.isValid());
+    QByteArray encoded = encoder.encode(source);
+    QCOMPARE(encoded, expected);
+}
+
+void tst_QStringConverter::charByCharConsistency_data()
+{
+    QTest::addColumn<QStringView>("source");
+    QTest::addColumn<QByteArray>("codec");
+
+    auto addRow = [](const TestString &s) {
+        QTest::addRow("%s_shift_jis", s.description) << s.utf16 << QByteArray("shift_jis");
+        QTest::addRow("%s_EUC-CN", s.description) << s.utf16 << QByteArray("EUC-CN");
+    };
+
+    for (const TestString &s : testStrings) {
+        if (s.utf16.isEmpty())
+            continue;
+        addRow(s);
+    }
+}
+
+void tst_QStringConverter::charByCharConsistency()
+{
+    QFETCH(QStringView, source);
+    QFETCH(QByteArray, codec);
+
+    {
+        QStringEncoder encoder(codec);
+        if (!encoder.isValid())
+            QSKIP("Unsupported codec");
+
+        QByteArray fullyConverted = encoder.encode(source);
+        encoder.resetState();
+        QByteArray stepByStepConverted;
+        for (const auto& codeUnit: source) {
+            stepByStepConverted += encoder.encode(codeUnit);
+        }
+        QCOMPARE(stepByStepConverted, fullyConverted);
+    }
+
+    {
+        QStringEncoder encoder(codec, QStringConverter::Flag::ConvertInvalidToNull);
+
+        QByteArray fullyConverted = encoder.encode(source);
+        encoder.resetState();
+        QByteArray stepByStepConverted;
+        for (const auto& codeUnit: source) {
+            stepByStepConverted += encoder.encode(codeUnit);
+        }
+        QCOMPARE(stepByStepConverted, fullyConverted);
+    }
+}
+
+void tst_QStringConverter::byteByByteConsistency_data()
+{
+    QTest::addColumn<QByteArray>("source");
+    QTest::addColumn<QByteArray>("codec");
+
+    QTest::addRow("plain_ascii_utf7") << QByteArray("Hello, world!") << QByteArray("UTF-7");
+    QFile eucKr(":/euc_kr.txt");
+    if (eucKr.open(QFile::ReadOnly))
+        QTest::addRow("euc_kr_storing_jp") << eucKr.readAll() << QByteArray("EUC-KR");
+    QTest::addRow("incomplete_euc_jp") << QByteArrayLiteral("test\x8Ftest") << QByteArray("EUC-JP");
+}
+
+void tst_QStringConverter::byteByByteConsistency()
+{
+    QFETCH(QByteArray, source);
+    QFETCH(QByteArray, codec);
+
+    {
+        QStringDecoder decoder(codec);
+        if (!decoder.isValid())
+            QSKIP("Unsupported codec");
+
+        QString fullyConverted = decoder.decode(source);
+        decoder.resetState();
+        QString stepByStepConverted;
+        for (const auto& byte: source) {
+            QByteArray singleChar;
+            singleChar.append(byte);
+            stepByStepConverted += decoder.decode(singleChar);
+        }
+        QCOMPARE(stepByStepConverted, fullyConverted);
+    }
+
+    {
+        QStringDecoder decoder(codec, QStringConverter::Flag::ConvertInvalidToNull);
+        if (!decoder.isValid())
+            QSKIP("Unsupported codec");
+
+        QString fullyConverted = decoder.decode(source);
+        decoder.resetState();
+        QString stepByStepConverted;
+        for (const auto& byte: source) {
+            QByteArray singleChar;
+            singleChar.append(byte);
+            stepByStepConverted += decoder.decode(singleChar);
+        }
+        QCOMPARE(stepByStepConverted, fullyConverted);
+    }
+}
+
+void tst_QStringConverter::statefulPieceWise()
+{
+    QStringDecoder decoder("HZ");
+    if (!decoder.isValid())
+        QSKIP("Unsupported codec");
+    QString start = decoder.decode("pure ASCII");
+    QCOMPARE(start, u"pure ASCII");
+    QString shifted = decoder.decode("~{");
+    // shift out changes the state, but won't create any output
+    QCOMPARE(shifted, "");
+    QString continuation = decoder.decode("\x42\x43");
+    QCOMPARE(continuation, "旅");
+    decoder.resetState();
+    // after resetting the state we're in N0 again
+    QString afterReset = decoder.decode("\x42\x43");
+    QCOMPARE(afterReset, "BC");
+}
+
+void tst_QStringConverter::icuUsableAfterMove()
+{
+    {
+        QStringDecoder decoder("EUC-JP");
+        QVERIFY(decoder.isValid());
+        QString partial = decoder.decode("Test\x8E");
+        QCOMPARE(partial, u"Test"_s);
+        QStringDecoder moved(std::move(decoder));
+        QString complete = partial + moved.decode("\xA1Test");
+        QCOMPARE(complete, u"Test\uFF61Test"_s);
+    }
+    {
+        QStringEncoder encoder("Big5");
+        QVERIFY(encoder.isValid());
+        QByteArray encoded = encoder.encode("hello"_L1);
+        QCOMPARE(encoded, "hello");
+        QStringEncoder moved(std::move(encoder));
+        encoded = moved.encode("bye");
+        QCOMPARE(encoded, "bye");
+    }
+}
+
+void tst_QStringConverter::icuInvalidCharacter_data()
+{
+    QTest::addColumn<QString>("string");
+    QTest::addColumn<QByteArray>("bytearray");
+    QTest::addColumn<QByteArray>("codec");
+    QTest::addColumn<QStringConverter::Flags>("flags");
+    QTest::addColumn<bool>("shouldDecode");
+
+    using Flags = QStringConverter::Flags;
+    using Flag = QStringConverter::Flag;
+    QTest::addRow("encode")
+            << u"Test👪Test"_s
+            << QByteArrayLiteral("\xE3\x85\xA2\xA3\x3F\xE3\x85\xA2\xA3")
+            << QByteArray("IBM-037") << Flags(Flag::Default)
+            << false;
+    QTest::addRow("encode_null")
+            << u"Test👪Test"_s
+            << QByteArrayLiteral("\xE3\x85\xA2\xA3\0\xE3\x85\xA2\xA3")
+            << QByteArray("IBM-037") << Flags(Flag::ConvertInvalidToNull)
+            << false;
+    QTest::addRow("decode_incomplete_EUC-JP")
+            << u"test"_s
+            << QByteArrayLiteral("test\x8F")
+            << QByteArray("EUC-JP") << Flags(Flag::Stateless)
+            << true;
+    QTest::addRow("decode_invalid_EUC-JP_sequence")
+            << u"test\0test"_s
+            << QByteArrayLiteral("test\x8Ftest")
+            << QByteArray("EUC-JP") << Flags(Flag::ConvertInvalidToNull)
+            << true;
+    QTest::addRow("encode_incomplete_surrogate")
+            << u"test"_s + QChar::highSurrogate(0x11136)
+            << QByteArray("test")
+            << QByteArray("EUC-JP") << Flags(Flag::Stateless)
+            << false;
+}
+
+void tst_QStringConverter::icuInvalidCharacter()
+{
+    QFETCH(QString, string);
+    QFETCH(QByteArray, bytearray);
+    QFETCH(QByteArray, codec);
+    QFETCH(QStringConverter::Flags, flags);
+    QFETCH(bool, shouldDecode);
+    if (shouldDecode) {
+        QStringDecoder decoder(codec.data(), flags);
+        QVERIFY(decoder.isValid());
+        QString decoded = decoder.decode(bytearray);
+        QVERIFY(decoder.hasError());
+        QCOMPARE(decoded, string);
+    } else {
+        QStringEncoder encoder(codec.data(), flags);
+        QVERIFY(encoder.isValid());
+        QByteArray encoded = encoder.encode(string);
+        QVERIFY(encoder.hasError());
+        QCOMPARE(encoded, bytearray);
+    }
+}
+
+#endif
+
 void tst_QStringConverter::flagF7808080() const
 {
     /* This test case stems from test not-wf-sa-170, tests/qxmlstream/XML-Test-Suite/xmlconf/xmltest/not-wf/sa/166.xml,
author	Fabian Kosmale <fabian.kosmale@qt.io>	2022-01-31 11:25:25 +0100
committer	Fabian Kosmale <fabian.kosmale@qt.io>	2022-06-19 00:41:12 +0200
commit	122270d6bea164e6df4357f4d4d77aacfa430470 (patch)
tree	4d0477aa23a0575b7d6185311ca51a56746ab0f4
parent	d350373133f169b44fd98faab6fe3f75abab6282 (diff)