summary | refs | log | tree | commit | diff | stats
path: root/src/corelib/text/qstringconverter.cpp
diff options
context:
space:
mode:
author: Lars Knoll <lars.knoll@qt.io> 2020-04-30 11:43:21 +0200
committer: Lars Knoll <lars.knoll@qt.io> 2020-05-14 07:49:05 +0200
commit: b8db1233411893aaecb0b6a61b02b0ef3c1520e5 (patch)
tree: 13021d96df118c38d72b64de2eaee3628c8eaad9 /src/corelib/text/qstringconverter.cpp
parent: 13af1312f7416dd23baf512dcb9e51dce3d936fc (diff)
Add a method to determine the encoding for encoded HTML data
This is a replacement for Qt::codecForHtml().

Change-Id: I31f03518fd9c70507cbd210a8bcf405b6a0106b1
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r-- src/corelib/text/qstringconverter.cpp 48
1 files changed, 48 insertions, 0 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 9a3d92dbaa..bc81b06c0e 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -44,6 +44,7 @@
#include "private/qsimd_p.h"
#include "private/qstringiterator_p.h"
+#include "qbytearraymatcher.h"
#ifdef Q_OS_WIN
#include <qt_windows.h>
@@ -1531,6 +1532,53 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(cons
return std::nullopt;
}
+/*!
+    Tries to determine the encoding of the HTML in \a buf, which holds
+    \a arraySize bytes.
+
+    Anything encodingForData() detects (such as a leading byte order mark)
+    takes precedence. Otherwise the first 1024 bytes are searched
+    case-insensitively for a \c{charset=} specifier that follows \c{meta}.
+
+    If the optional is empty, the encoding specified is not supported by
+    QStringConverter. If no encoding is detected, the method returns Utf8.
+*/
+std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(const char *buf, qsizetype arraySize)
+{
+    // Trust the initial BOM over anything written in the markup.
+    if (auto encoding = encodingForData(buf, arraySize))
+        return encoding;
+
+    // Only the start of the document is inspected; lower-casing it makes the
+    // tag and attribute matching below case-insensitive.
+    const QByteArray header = QByteArray(buf, qMin(arraySize, qsizetype(1024))).toLower();
+
+    qsizetype pos = header.indexOf("meta ");
+    if (pos == -1)
+        return Utf8;
+    pos = header.indexOf("charset=", pos);
+    if (pos == -1)
+        return Utf8;
+    pos += qsizetype(qstrlen("charset="));
+
+    // Skip an opening quote, if there is one.
+    if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
+        ++pos;
+
+    // The attribute can be closed with either """, "'", ">" or "/",
+    // none of which are valid charset characters. The scan starts at pos
+    // itself so that an empty value such as charset="" ends right away
+    // instead of swallowing its own closing quote.
+    qsizetype end = pos;
+    while (end < header.size()) {
+        const char ch = header.at(end);
+        if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/')
+            break;
+        ++end;
+    }
+    if (end >= header.size())
+        return Utf8; // the value is not terminated within the inspected header
+
+    QByteArray name = header.mid(pos, end - pos);
+    const qsizetype colon = name.indexOf(':');
+    if (colon > 0)
+        name = name.left(colon);
+    name = name.simplified();
+    if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
+        name = QByteArrayLiteral("UTF-8");
+
+    // An empty value carries no information: use the default rather than
+    // scanning on past the terminator for a bogus name.
+    if (name.isEmpty())
+        return Utf8;
+    return encodingForName(name);
+}
+
+/*!
+ Returns the canonical name for the encoding \a e.
+*/
const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
{
return encodingInterfaces[int(e)].name;