diff options
author | Lars Knoll <lars.knoll@qt.io> | 2020-04-30 11:43:21 +0200 |
---|---|---|
committer | Lars Knoll <lars.knoll@qt.io> | 2020-05-14 07:49:05 +0200 |
commit | b8db1233411893aaecb0b6a61b02b0ef3c1520e5 (patch) | |
tree | 13021d96df118c38d72b64de2eaee3628c8eaad9 /src/corelib/text/qstringconverter.cpp | |
parent | 13af1312f7416dd23baf512dcb9e51dce3d936fc (diff) |
Add a method to determine the encoding for encoded HTML data
This is a replacement for Qt::codecForHtml().
Change-Id: I31f03518fd9c70507cbd210a8bcf405b6a0106b1
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/text/qstringconverter.cpp')
-rw-r--r-- | src/corelib/text/qstringconverter.cpp | 48 |
1 files changed, 48 insertions, 0 deletions
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 9a3d92dbaa..bc81b06c0e 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -44,6 +44,7 @@
 #include "private/qsimd_p.h"
 #include "private/qstringiterator_p.h"
+#include "qbytearraymatcher.h"
 #ifdef Q_OS_WIN
 #include <qt_windows.h>
@@ -1531,6 +1532,53 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForData(cons
     return std::nullopt;
 }
+/*!
+    Tries to determine the encoding of the HTML in \a buf (\a arraySize bytes) by looking at leading
+    byte order marks or, failing that, a charset specifier in a meta tag within the first 1024 bytes.
+    If the optional is empty, the encoding specified is not supported by QStringConverter. If no
+    encoding is detected, the method returns Utf8.
+ */
+std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(const char *buf, qsizetype arraySize)
+{
+    // determine charset; a result from encodingForData() takes precedence over any meta tag
+    auto encoding = encodingForData(buf, arraySize);
+    if (encoding)
+        // trust the initial BOM
+        return encoding;
+
+    QByteArray header = QByteArray(buf, qMin(arraySize, qsizetype(1024))).toLower();
+    qsizetype pos = header.indexOf("meta ");
+    if (pos != -1) {
+        pos = header.indexOf("charset=", pos);
+        if (pos != -1) {
+            pos += qstrlen("charset=");
+            if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
+                ++pos;
+
+            qsizetype pos2 = pos;
+            // The attribute can be closed with either """, "'", ">" or "/",
+            // none of which are valid charset characters.
+            while (++pos2 < header.size()) {
+                char ch = header.at(pos2);
+                if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
+                    QByteArray name = header.mid(pos, pos2 - pos);
+                    qsizetype colon = name.indexOf(':');
+                    if (colon > 0)
+                        name = name.left(colon);
+                    name = name.simplified();
+                    if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
+                        name = QByteArrayLiteral("UTF-8");
+                    if (!name.isEmpty())
+                        return encodingForName(name);
+                }
+            }
+        }
+    }
+    return Utf8;
+}
+
+/*!
+ Returns the canonical name for \a encoding. +*/ const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) { return encodingInterfaces[int(e)].name; |