summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMitch Curtis <mitch.curtis@digia.com>2013-02-07 10:24:01 +0100
committerThe Qt Project <gerrit-noreply@qt-project.org>2013-02-12 01:31:26 +0100
commit86115848b55faa747adf8bb39a213b3cec7673c4 (patch)
treec6a49f6eba56f3eb2986e9f729362084948c5a78 /src
parent7abc1a6a828eb36d26901d4ea5ecf8b4514f3cd7 (diff)
Correctly detect HTML 5 charset attribute in QTextCodec::codecForHtml()
QTextCodec::codecForHtml currently fails to detect the charset for this HTML: <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=9,chrome=1"> <title>Test</title> </head> This patch makes the detection of charsets more flexible, allowing for the use of the HTML 5 charset attribute as well more terminator characters ("'", and ">"). I also added a *_data function for the unit tests. Task-number: QTBUG-5451 Change-Id: I69fe4a04582f0d845cbbe9140a86a950fb7dc861 Reviewed-by: Olivier Goffart <ogoffart@woboq.com> Reviewed-by: Denis Dzyubenko <denis@ddenis.info>
Diffstat (limited to 'src')
-rw-r--r--src/corelib/codecs/qtextcodec.cpp32
1 files changed, 17 insertions, 15 deletions
diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp
index 0e671518ef..511817677c 100644
--- a/src/corelib/codecs/qtextcodec.cpp
+++ b/src/corelib/codecs/qtextcodec.cpp
@@ -1043,28 +1043,30 @@ QString QTextDecoder::toUnicode(const QByteArray &ba)
QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
{
// determine charset
- int pos;
- QTextCodec *c = 0;
-
- c = QTextCodec::codecForUtfText(ba, c);
+ QTextCodec *c = QTextCodec::codecForUtfText(ba, 0);
if (!c) {
QByteArray header = ba.left(512).toLower();
- if ((pos = header.indexOf("http-equiv=")) != -1) {
- if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
- pos = header.indexOf("charset=", pos) + int(strlen("charset="));
- if (pos != -1) {
- int pos2 = header.indexOf('\"', pos+1);
- QByteArray cs = header.mid(pos, pos2-pos);
- // qDebug("found charset: %s", cs.data());
- c = QTextCodec::codecForName(cs);
+ int pos = header.indexOf("meta ");
+ if (pos != -1) {
+ pos = header.indexOf("charset=", pos);
+ if (pos != -1) {
+ pos += qstrlen("charset=");
+
+ int pos2 = pos;
+ // The attribute can be closed with either """, "'", ">" or "/",
+ // none of which are valid charset characters.
+ while (++pos2 < header.size()) {
+ char ch = header.at(pos2);
+ if (ch == '\"' || ch == '\'' || ch == '>') {
+ c = QTextCodec::codecForName(header.mid(pos, pos2 - pos));
+ return c ? c : defaultCodec;
+ }
}
}
}
}
- if (!c)
- c = defaultCodec;
- return c;
+ return defaultCodec;
}
/*!