summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/corelib/text/qstringconverter.cpp48
-rw-r--r--src/corelib/text/qstringconverter.h1
-rw-r--r--tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp89
3 files changed, 138 insertions, 0 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 9a3d92dbaa..bc81b06c0e 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -44,6 +44,7 @@
#include "private/qsimd_p.h"
#include "private/qstringiterator_p.h"
+#include "qbytearraymatcher.h"
#ifdef Q_OS_WIN
#include <qt_windows.h>
@@ -1531,6 +1532,53 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(cons
return std::nullopt;
}
+/*!
+    Tries to determine the encoding of the HTML in \a buf, of which \a arraySize bytes
+    are considered, by looking at leading byte order marks or a charset specifier in
+    the HTML meta tag. Only the first 1024 bytes are searched for the meta tag.
+
+    If the returned optional is empty, the document names an encoding that is not
+    supported by QStringConverter. If no encoding is detected at all, the method
+    returns Utf8.
+ */
+std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(const char *buf, qsizetype arraySize)
+{
+    // determine charset
+    auto encoding = encodingForData(buf, arraySize);
+    if (encoding)
+        // trust the initial BOM
+        return encoding;
+
+    // Work on a lower-cased copy of the first 1024 bytes so that the searches
+    // below are case-insensitive.
+    const QByteArray header = QByteArray(buf, qMin(arraySize, qsizetype(1024))).toLower();
+
+    // QByteArray reports positions and sizes as qsizetype; keep every index in
+    // that type instead of narrowing to int.
+    qsizetype pos = header.indexOf("meta ");
+    if (pos != -1) {
+        pos = header.indexOf("charset=", pos);
+        if (pos != -1) {
+            pos += qsizetype(qstrlen("charset="));
+            // Skip an opening quote, if any.
+            if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
+                ++pos;
+
+            qsizetype pos2 = pos;
+            // The attribute can be closed with either """, "'", ">" or "/",
+            // none of which are valid charset characters.
+            // Scanning starts one byte past pos, so the byte at pos itself is
+            // never treated as a terminator.
+            while (++pos2 < header.size()) {
+                const char ch = header.at(pos2);
+                if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
+                    QByteArray name = header.mid(pos, pos2 - pos);
+                    // Drop a ":suffix" such as the year in "iso_8859-1:1987".
+                    const qsizetype colon = name.indexOf(':');
+                    if (colon > 0)
+                        name = name.left(colon);
+                    name = name.simplified();
+                    if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
+                        name = QByteArrayLiteral("UTF-8");
+                    // An empty name is ignored; scanning continues to the next terminator.
+                    if (!name.isEmpty())
+                        return encodingForName(name);
+                }
+            }
+        }
+    }
+    // No BOM and no complete charset specifier found.
+    return Utf8;
+}
+
+/*!
+ Returns the canonical name for encoding \a e.
+*/
const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
{
return encodingInterfaces[int(e)].name;
diff --git a/src/corelib/text/qstringconverter.h b/src/corelib/text/qstringconverter.h
index 6269ace4ac..7443173df6 100644
--- a/src/corelib/text/qstringconverter.h
+++ b/src/corelib/text/qstringconverter.h
@@ -168,6 +168,7 @@ public:
Q_CORE_EXPORT static std::optional<Encoding> encodingForName(const char *name);
Q_CORE_EXPORT static const char *nameForEncoding(Encoding e);
Q_CORE_EXPORT static std::optional<Encoding> encodingForData(const char *buf, qsizetype arraySize, char16_t expectedFirstCharacter = 0);
+ Q_CORE_EXPORT static std::optional<Encoding> encodingForHtml(const char *buf, qsizetype arraySize);
protected:
const Interface *iface;
diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
index 3f4bbb413f..78595bc17b 100644
--- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
+++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp
@@ -68,6 +68,9 @@ private slots:
void encodingForData_data();
void encodingForData();
+
+ void encodingForHtml_data();
+ void encodingForHtml();
};
void tst_QStringConverter::constructByName()
@@ -1722,6 +1725,92 @@ void tst_QStringConverter::encodingForData()
QCOMPARE(e, encoding);
}
+
+// Data rows for encodingForHtml(): each row pairs an HTML snippet with the
+// result expected from QStringConverter::encodingForHtml(). An empty optional
+// is the expected result for the rows whose charset name is not recognized.
+void tst_QStringConverter::encodingForHtml_data()
+{
+    using MaybeEncoding = std::optional<QStringConverter::Encoding>;
+
+    QTest::addColumn<QByteArray>("html");
+    QTest::addColumn<MaybeEncoding>("encoding");
+
+    // Small helper so every row reads as: name, input, expected result.
+    const auto addRow = [](const char *tag, const QByteArray &markup, const MaybeEncoding &expected) {
+        QTest::newRow(tag) << markup << expected;
+    };
+
+    addRow("no charset",
+           "<html><head></head><body>blah</body></html>",
+           QStringConverter::Utf8);
+
+    addRow("latin 15",
+           "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15\" /></head></html>",
+           std::nullopt);
+
+    addRow("latin 1",
+           "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /></head></html>",
+           QStringConverter::Latin1);
+
+    addRow("latin 1 (#2)",
+           "<!DOCTYPE html><html><head><meta charset=\"ISO_8859-1:1987\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>",
+           QStringConverter::Latin1);
+
+    addRow("UTF-8",
+           "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><title>Test</title></head>",
+           QStringConverter::Utf8);
+
+    addRow("UTF-8 (#2)",
+           "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8\"><title>Test</title></head>",
+           QStringConverter::Utf8);
+
+    addRow("UTF-8, no quotes",
+           "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8/></head></html>",
+           QStringConverter::Utf8);
+
+    addRow("UTF-8, single quotes",
+           "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset='UTF-8'/></head></html>",
+           QStringConverter::Utf8);
+
+    addRow("UTF-8, > terminator",
+           "<!DOCTYPE html><html><head><meta charset=utf-8><title>Test</title></head>",
+           QStringConverter::Utf8);
+
+    addRow("UTF-8, > terminator with spaces",
+           "<!DOCTYPE html><html><head><meta charset= utf-8 ><title>Test</title></head>",
+           QStringConverter::Utf8);
+
+    // Test invalid charsets.
+    addRow("utf/8",
+           "<!DOCTYPE html><html><head><meta charset= utf/8 ><title>Test</title></head>",
+           std::nullopt);
+
+    addRow("invalid charset, no default",
+           "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=invalid-foo\" /></head></html>",
+           std::nullopt);
+
+    // Input that stops right after the opening quote of the charset value,
+    // left-padded with spaces to a total of 512 bytes.
+    QByteArray padded = "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"";
+    padded.prepend(QByteArray().fill(' ', 512 - padded.size()));
+    addRow("invalid charset (large header)", padded, QStringConverter::Utf8);
+
+    addRow("invalid charset (no closing double quote)",
+           "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset=\"utf-8",
+           QStringConverter::Utf8);
+
+    addRow("invalid charset (no closing single quote)",
+           "<!DOCTYPE html><html><head><meta http-equiv=\"X-UA-Compatible\" content=\"IE=9,chrome=1\"><meta charset='utf-8",
+           QStringConverter::Utf8);
+
+    addRow("invalid (space terminator)",
+           "<!DOCTYPE html><html><head><meta charset=utf-8 foo=bar><title>Test</title></head>",
+           std::nullopt);
+
+    addRow("invalid charset, early terminator (')",
+           "<!DOCTYPE html><html><head><meta charset=\" utf' 8 /><title>Test</title></head>",
+           std::nullopt);
+
+    // Starts with the bytes 0xff 0xfe; the array is NUL-terminated, so the
+    // QByteArray built from it holds only the four bytes before the first NUL.
+    const char src[] = { char(0xff), char(0xfe), char(0x7a), char(0x03), 0, 0 };
+    addRow("greek text UTF-16LE", QByteArray(src), QStringConverter::Utf16LE);
+
+    addRow("greek text UTF-8",
+           "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"><span style=\"color: rgb(0, 0, 0); font-family: "
+           "'Galatia SIL'; font-size: 27px; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; "
+           "line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: "
+           "auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px; display: inline !important; float: "
+           "none;\">&#x37b</span>\000",
+           QStringConverter::Utf8);
+
+    addRow("legacy unicode UTF-8",
+           "<!DOCTYPE html><html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=unicode\">"
+           "<head/><body><p>bla</p></body></html>", // QTBUG-41998, ICU will return UTF-16.
+           QStringConverter::Utf8);
+}
+
+// Runs QStringConverter::encodingForHtml() on the current data row and checks
+// the detected encoding against the expected one.
+void tst_QStringConverter::encodingForHtml()
+{
+    QFETCH(QByteArray, html);
+    QFETCH(std::optional<QStringConverter::Encoding>, encoding);
+
+    const auto detected = QStringConverter::encodingForHtml(html.constData(), html.size());
+    QCOMPARE(detected, encoding);
+}
+
class LoadAndConvert: public QRunnable
{
public: