summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIvan Solovev <ivan.solovev@qt.io>2021-08-30 10:38:28 +0200
committerIvan Solovev <ivan.solovev@qt.io>2021-09-08 14:31:29 +0200
commit6ab89e118251a836a32bfd7e6b8db5cb033b0b4b (patch)
treecdd4d439a2599805e245f24c18637fea8816c5f7
parent57c86998fd1e891a032b6cfe5a874d17a238e178 (diff)
Fix QTextCodec::canEncode() for ICU codec
QTextCodec::canEncode() relies on the number of invalid characters to determine whether the encoding is possible or not. By default, the ICU fromUnicode converter method does not provide any way to track the number of failures. However, it uses callbacks to report errors or to replace unrecognized characters with a substitute string. This patch introduces a custom callback for the fromUnicode conversion. The callback just increments the invalid-characters counter and then calls the default callback, which does its usual job. Task-number: QTBUG-83081 Change-Id: Ie07fd183c728c7c77e8285f55238b1d57f5c9eb2 (adapted from commit 421de71a521ab07e942ae46a8f0a8f36147d86c8) Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
-rw-r--r--src/corelib/codecs/qicucodec.cpp38
-rw-r--r--tests/auto/corelib/codecs/qtextcodec/test.pro2
-rw-r--r--tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp69
3 files changed, 107 insertions, 2 deletions
diff --git a/src/corelib/codecs/qicucodec.cpp b/src/corelib/codecs/qicucodec.cpp
index c1f4eecd52..c570166427 100644
--- a/src/corelib/codecs/qicucodec.cpp
+++ b/src/corelib/codecs/qicucodec.cpp
@@ -565,6 +565,32 @@ QIcuCodec::~QIcuCodec()
{
}
+/*!
+    \internal
+
+    Custom callback for the ICU from-Unicode conversion. ICU invokes it when
+    the conversion detects an illegal, irregular or unassigned character, and
+    also to notify the callback about converter reset, close and clone events.
+
+    \a context is expected to be either null or a pointer to a
+    QTextCodec::ConverterState structure. For genuine conversion failures
+    (\a reason not greater than UCNV_IRREGULAR) the state's invalid
+    characters count is incremented. Reset, close and clone notifications
+    are not counted, because they do not correspond to any character of the
+    input.
+
+    The remaining arguments are then forwarded unchanged to ICU's default
+    substitution callback, which replaces the invalid characters properly.
+*/
+static void customFromUnicodeSubstitutionCallback(const void *context,
+                                                  UConverterFromUnicodeArgs *fromUArgs,
+                                                  const UChar *codeUnits,
+                                                  int32_t length,
+                                                  UChar32 codePoint,
+                                                  UConverterCallbackReason reason,
+                                                  UErrorCode *err)
+{
+    // ICU passes the context as const void *; drop the const and restore the
+    // pointer type that was registered together with this callback.
+    auto *state = static_cast<QTextCodec::ConverterState *>(const_cast<void *>(context));
+    // Reasons above UCNV_IRREGULAR (UCNV_RESET, UCNV_CLOSE, UCNV_CLONE) are
+    // lifecycle notifications rather than conversion failures, so they must
+    // not be reported as invalid characters.
+    if (state && reason <= UCNV_IRREGULAR)
+        state->invalidChars++;
+    // Call the default callback that replaces all illegal or unrecognized
+    // sequences with the substitute string
+    UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length, codePoint, reason, err);
+}
+
UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const
{
UConverter *conv = nullptr;
@@ -577,8 +603,18 @@ UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const
state->d = ucnv_open(m_name, &error);
ucnv_setSubstChars(static_cast<UConverter *>(state->d),
state->flags & QTextCodec::ConvertInvalidToNull ? "\0" : "?", 1, &error);
- if (U_FAILURE(error))
+ if (!U_FAILURE(error)) {
+ error = U_ZERO_ERROR;
+ ucnv_setFromUCallBack(static_cast<UConverter *>(state->d),
+ customFromUnicodeSubstitutionCallback, state, nullptr,
+ nullptr, &error);
+ if (U_FAILURE(error)) {
+ qDebug("getConverter(state) failed to install custom callback. "
+ "canEncode() may report incorrect results.");
+ }
+ } else {
qDebug("getConverter(state) ucnv_open failed %s %s", m_name, u_errorName(error));
+ }
}
conv = static_cast<UConverter *>(state->d);
}
diff --git a/tests/auto/corelib/codecs/qtextcodec/test.pro b/tests/auto/corelib/codecs/qtextcodec/test.pro
index 7505c5ad51..07c1e4e2bd 100644
--- a/tests/auto/corelib/codecs/qtextcodec/test.pro
+++ b/tests/auto/corelib/codecs/qtextcodec/test.pro
@@ -1,5 +1,5 @@
CONFIG += testcase
-QT = core testlib
+QT = core-private testlib
SOURCES = tst_qtextcodec.cpp
TARGET = tst_qtextcodec
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index 78b6449a69..62a8321844 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -38,6 +38,11 @@
#endif
#include <QThreadPool>
+#include <private/qglobal_p.h> // for the icu feature test
+#if QT_CONFIG(icu)
+# include <unicode/uvernum.h>
+#endif
+
class tst_QTextCodec : public QObject
{
Q_OBJECT
@@ -96,6 +101,9 @@ private slots:
void shiftJis();
void userCodec();
+
+ void canEncode();
+ void canEncode_data();
};
void tst_QTextCodec::toUnicode_data()
@@ -2455,6 +2463,67 @@ void tst_QTextCodec::userCodec()
QCOMPARE(pcodec, nullptr);
}
+// Data-driven test fed by canEncode_data(): looks up the codec named in the
+// current row, then checks both the canEncode() verdict and the bytes that
+// fromUnicode() produces for the same input string.
+void tst_QTextCodec::canEncode()
+{
+    QFETCH(QString, codecName);
+    QFETCH(QString, inputString);
+    QFETCH(QByteArray, expectedData);
+    QFETCH(bool, canEncode);
+
+    const QByteArray latin1Name = codecName.toLatin1();
+    const QTextCodec *const codec = QTextCodec::codecForName(latin1Name);
+    QVERIFY(codec != nullptr);
+
+    // The verdict is checked before the encoded bytes, so a wrong
+    // canEncode() answer is the first failure reported for a row.
+    QCOMPARE(codec->canEncode(inputString), canEncode);
+    const QByteArray encoded = codec->fromUnicode(inputString);
+    QCOMPARE(encoded, expectedData);
+}
+
+// Supplies the rows for canEncode(): the codec name, the input text, the
+// bytes fromUnicode() is expected to produce, and the expected canEncode()
+// result for that input.
+void tst_QTextCodec::canEncode_data()
+{
+    QTest::addColumn<QString>("codecName");
+    QTest::addColumn<QString>("inputString");
+    QTest::addColumn<QByteArray>("expectedData");
+    QTest::addColumn<bool>("canEncode");
+
+    // Inputs that are fed to more than one codec below.
+    const QString greekText =
+            QString("\u03c0\u03bf\u03bb\u03cd\u03c4\u03c1\u03bf\u03c0\u03bf\u03bd");
+    const QString russianText =
+            QString("\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440");
+
+    // Plain ASCII text is expected to pass through unchanged.
+    QTest::newRow("English ISO-8859-1")
+            << "ISO-8859-1" << "Hello World" << QByteArray("Hello World") << true;
+    QTest::newRow("English big5")
+            << "Big5" << "Hello World" << QByteArray("Hello World") << true;
+
+    // Greek text: every letter becomes '?' in Windows-1252, while
+    // Windows-1253 has a single-byte encoding for each of them.
+    QTest::newRow("Greek win1252")
+            << "Windows-1252" << greekText << QByteArray("??????????") << false;
+    QTest::newRow("Greek win1253")
+            << "Windows-1253" << greekText
+            << QByteArray("\xF0\xEF\xEB\xFD\xF4\xF1\xEF\xF0\xEF\xED") << true;
+
+    // Russian text: same pattern with Windows-1252 versus Windows-1251; the
+    // space survives in both encodings.
+    QTest::newRow("Russian win1252")
+            << "Windows-1252" << russianText << QByteArray("?????? ???") << false;
+    QTest::newRow("Russian win1251")
+            << "Windows-1251" << russianText
+            << QByteArray("\xCF\xF0\xE8\xE2\xE5\xF2 \xEC\xE8\xF0") << true;
+
+    QTest::newRow("English from ucs4")
+            << "ISO-8859-1" << QString("\u0048\u0065\u006c\u006c\u006f\u0021")
+            << QByteArray("Hello!") << true;
+
+    // ICU on Linux RHEL 7.6 seems to be old, and does not handle NULL
+    // characters properly. It returns 0x01 instead of 0x00 for it, so
+    // we just skip the test.
+#if !QT_CONFIG(icu) || (U_ICU_VERSION_MAJOR_NUM > 56)
+    QTest::newRow("With null")
+            << "ISO-8859-1" << QString::fromUcs4(U"Hello\u0000World", 11)
+            << QByteArray("Hello\x00World", 11) << true;
+#endif
+
+    QTest::newRow("With special chars")
+            << "ISO-8859-1" << QString("\u0001\u0002\u0003\u0008\u0009\u000a\u000b\u000d")
+            << QByteArray("\x01\x02\x03\b\t\n\x0B\r") << true;
+
+    // A code point outside ISO-8859-1 is expected to be substituted by '?'.
+    QTest::newRow("Pencil icon")
+            << "ISO-8859-1" << QString("\u270f") << QByteArray("?") << false;
+}
+
struct DontCrashAtExit {
~DontCrashAtExit() {
QTextCodec *c = QTextCodec::codecForName("utf8");