From 1197280748b017cbf0c6234c7cb226eca73c00eb Mon Sep 17 00:00:00 2001 From: Lars Knoll Date: Tue, 31 May 2022 11:58:50 +0200 Subject: Add QStringDecoder::decoderForHtml() Now that QStringConverter can handle non-UTF encodings through ICU, add a way to get a decoder for arbitrary HTML code. As opposed to QStringConverter::encodingForHtml(), this method will try to create a valid string decoder for non-Unicode codecs as well. Change-Id: I343584da1b114396c744f482d9b433c9cedcc511 Reviewed-by: Fabian Kosmale Reviewed-by: Lars Knoll Reviewed-by: Thiago Macieira (cherry picked from commit 9c1f3b6d4d5a5fe59396062c6f68cc1201665c62) Reviewed-by: Qt Cherry-pick Bot --- src/corelib/kernel/qmimedata.cpp | 7 +- src/corelib/text/qstringconverter.cpp | 64 +++++++++++++---- src/corelib/text/qstringconverter.h | 3 + src/widgets/widgets/qtextbrowser.cpp | 9 ++- .../text/qstringconverter/tst_qstringconverter.cpp | 84 ++++++++++++++++------ 5 files changed, 124 insertions(+), 43 deletions(-) diff --git a/src/corelib/kernel/qmimedata.cpp b/src/corelib/kernel/qmimedata.cpp index 661a30b167..02a86bff37 100644 --- a/src/corelib/kernel/qmimedata.cpp +++ b/src/corelib/kernel/qmimedata.cpp @@ -127,10 +127,9 @@ QVariant QMimeDataPrivate::retrieveTypedData(const QString &format, QMetaType ty if (ba.isNull()) return QVariant(); if (format == "text/html"_L1) { - auto encoding = QStringConverter::encodingForHtml(ba); - if (encoding) { - QStringDecoder toUtf16(*encoding); - return QString(toUtf16(ba)); + QStringDecoder decoder = QStringDecoder::decoderForHtml(ba); + if (decoder.isValid()) { + return QString(decoder(ba)); } // fall back to utf8 } diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index ba13194310..38a434f353 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -2049,20 +2049,8 @@ QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCha return std::nullopt; } -/*! 
- Tries to determine the encoding of the HTML in \a data by looking at leading byte - order marks or a charset specifier in the HTML meta tag. If the optional is empty, - the encoding specified is not supported by QStringConverter. If no encoding is - detected, the method returns Utf8. -*/ -std::optional QStringConverter::encodingForHtml(QByteArrayView data) +static QByteArray parseHtmlMetaForEncoding(QByteArrayView data) { - // determine charset - auto encoding = encodingForData(data); - if (encoding) - // trust the initial BOM - return encoding; - static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta "); static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset="); @@ -2089,14 +2077,62 @@ std::optional QStringConverter::encodingForHtml(QByt if (name == "unicode") // QTBUG-41998, ICU will return UTF-16. name = QByteArrayLiteral("UTF-8"); if (!name.isEmpty()) - return encodingForName(name); + return name; } } } } + return QByteArray(); +} + +/*! + Tries to determine the encoding of the HTML in \a data by looking at leading byte + order marks or a charset specifier in the HTML meta tag. If the optional is empty, + the encoding specified is not supported by QStringConverter. If no encoding is + detected, the method returns Utf8. + + \sa QStringDecoder::decoderForHtml() +*/ +std::optional QStringConverter::encodingForHtml(QByteArrayView data) +{ + // determine charset + std::optional encoding = encodingForData(data); + if (encoding) + // trust the initial BOM + return encoding; + + QByteArray encodingTag = parseHtmlMetaForEncoding(data); + if (!encodingTag.isEmpty()) + return encodingForName(encodingTag); + return Utf8; } +/*! + Tries to determine the encoding of the HTML in \a data by looking at leading byte + order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder + matching the encoding. If the returned decoder is not valid, + the encoding specified is not supported by QStringConverter. 
If no encoding is + detected, the method returns a decoder for Utf8. + + \sa isValid() +*/ +QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data) +{ + // determine charset + std::optional encoding = encodingForData(data); + if (encoding) + // trust the initial BOM + return QStringDecoder(encoding.value()); + + QByteArray encodingTag = parseHtmlMetaForEncoding(data); + if (!encodingTag.isEmpty()) + return QStringDecoder(encodingTag); + + return QStringDecoder(Utf8); +} + + /*! Returns the canonical name for encoding \a e. */ diff --git a/src/corelib/text/qstringconverter.h b/src/corelib/text/qstringconverter.h index c1645f0938..6664beccb3 100644 --- a/src/corelib/text/qstringconverter.h +++ b/src/corelib/text/qstringconverter.h @@ -140,6 +140,9 @@ public: } return iface->toUtf16(out, ba, &state); } + + Q_CORE_EXPORT static QStringDecoder decoderForHtml(QByteArrayView data); + private: QString decodeAsString(QByteArrayView in) { diff --git a/src/widgets/widgets/qtextbrowser.cpp b/src/widgets/widgets/qtextbrowser.cpp index 0273331c40..2bab2efc61 100644 --- a/src/widgets/widgets/qtextbrowser.cpp +++ b/src/widgets/widgets/qtextbrowser.cpp @@ -288,12 +288,11 @@ void QTextBrowserPrivate::setSource(const QUrl &url, QTextDocument::ResourceType } else if (data.userType() == QMetaType::QByteArray) { QByteArray ba = data.toByteArray(); if (type == QTextDocument::HtmlResource) { - auto encoding = QStringConverter::encodingForHtml(ba); - if (!encoding) + auto decoder = QStringDecoder::decoderForHtml(ba); + if (!decoder.isValid()) // fall back to utf8 - encoding = QStringDecoder::Utf8; - QStringDecoder toUtf16(*encoding); - txt = toUtf16(ba); + decoder = QStringDecoder(QStringDecoder::Utf8); + txt = decoder(ba); } else { txt = QString::fromUtf8(ba); } diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 07f29a6429..541a0bee45 100644 --- 
a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -2256,65 +2256,102 @@ void tst_QStringConverter::encodingForHtml_data() { QTest::addColumn("html"); QTest::addColumn>("encoding"); + QTest::addColumn("name"); // ICU name if we have ICU support QByteArray html = "blah"; - QTest::newRow("no charset") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("no charset") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = ""; - QTest::newRow("latin 15") << html << std::optional(); + QTest::newRow("latin 15") << html << std::optional() << QByteArray("ISO-8859-15"); + + html = ""; + QTest::newRow("sjis") << html << std::optional() << QByteArray("Shift_JIS"); + + html = ""; + QTest::newRow("ISO-2022-JP") << html << std::optional() << QByteArray("ISO-2022-JP"); + + html = ""; + QTest::newRow("ISO-2022") << html << std::optional() << QByteArray("ISO-2022-JP"); + + html = ""; + QTest::newRow("GB2312") << html << std::optional() << QByteArray("GB2312"); + + html = ""; + QTest::newRow("Big5") << html << std::optional() << QByteArray("Big5"); + + html = ""; + QTest::newRow("GB18030") << html << std::optional() << QByteArray("GB18030"); + + html = ""; + QTest::newRow("GB2312-HKSCS") << html << std::optional() << QByteArray("GB2312-HKSCS"); + + html = ""; + QTest::newRow("Big5-HKSCS") << html << std::optional() << QByteArray("Big5-HKSCS"); + + html = ""; + QTest::newRow("EucJP") << html << std::optional() << QByteArray("EUC-JP"); + + html = ""; + QTest::newRow("EucKR") << html << std::optional() << QByteArray("EUC-KR"); + + html = ""; + QTest::newRow("KOI8-R") << html << std::optional() << QByteArray("KOI8-R"); + + html = ""; + QTest::newRow("KOI8-U") << html << std::optional() << QByteArray("KOI8-U"); html = ""; - QTest::newRow("latin 1") << html << std::optional(QStringConverter::Latin1); + QTest::newRow("latin 1") << html << 
std::optional(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); html = "Test"; - QTest::newRow("latin 1 (#2)") << html << std::optional(QStringConverter::Latin1); + QTest::newRow("latin 1 (#2)") << html << std::optional(QStringConverter::Latin1) << QByteArray("ISO-8859-1"); html = "Test"; - QTest::newRow("UTF-8") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("UTF-8") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "Test"; - QTest::newRow("UTF-8 (#2)") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("UTF-8 (#2)") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = ""; - QTest::newRow("UTF-8, no quotes") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("UTF-8, no quotes") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = ""; - QTest::newRow("UTF-8, single quotes") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("UTF-8, single quotes") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "Test"; - QTest::newRow("UTF-8, > terminator") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("UTF-8, > terminator") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "Test"; - QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("UTF-8, > terminator with spaces") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); // Test invalid charsets. 
html = "Test"; - QTest::newRow("utf/8") << html << std::optional(); + QTest::newRow("utf/8") << html << std::optional() << QByteArray(); html = ""; - QTest::newRow("invalid charset, no default") << html << std::optional(); + QTest::newRow("invalid charset, no default") << html << std::optional() << QByteArray("UTF-8"); html = "(QStringConverter::Utf8); + QTest::newRow("invalid charset (large header)") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "(QStringConverter::Utf8); + QTest::newRow("invalid charset (no closing double quote)") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "Test"; - QTest::newRow("invalid charset, early terminator (')") << html << std::optional(); + QTest::newRow("invalid charset, early terminator (')") << html << std::optional() << QByteArray(); const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 }; html = src; - QTest::newRow("greek text UTF-16LE") << html << std::optional(QStringConverter::Utf16LE); + QTest::newRow("greek text UTF-16LE") << html << std::optional(QStringConverter::Utf16LE) << QByteArray("UTF-16LE"); html = "ͻ\000"; - QTest::newRow("greek text UTF-8") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("greek text UTF-8") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); html = "" "

bla

"; // QTBUG-41998, ICU will return UTF-16. - QTest::newRow("legacy unicode UTF-8") << html << std::optional(QStringConverter::Utf8); + QTest::newRow("legacy unicode UTF-8") << html << std::optional(QStringConverter::Utf8) << QByteArray("UTF-8"); } void tst_QStringConverter::encodingForHtml() { QFETCH(QByteArray, html); QFETCH(std::optional, encoding); + QFETCH(QByteArray, name); QCOMPARE(QStringConverter::encodingForHtml(html), encoding); + + QStringDecoder decoder = QStringDecoder::decoderForHtml(html); + if (encoding || // we should have a valid decoder independent of ICU support + decoder.isValid()) { // we got a valid decoder through ICU + QCOMPARE(decoder.name(), name); + } } class LoadAndConvert: public QRunnable -- cgit v1.2.3