diff options
author | Mitch Curtis <mitch.curtis@digia.com> | 2013-02-07 10:24:01 +0100 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2013-02-12 01:31:26 +0100 |
commit | 86115848b55faa747adf8bb39a213b3cec7673c4 (patch) | |
tree | c6a49f6eba56f3eb2986e9f729362084948c5a78 /src/corelib/codecs/qtextcodec.cpp | |
parent | 7abc1a6a828eb36d26901d4ea5ecf8b4514f3cd7 (diff) |
Correctly detect HTML 5 charset attribute in QTextCodec::codecForHtml()
QTextCodec::codecForHtml currently fails to detect the charset for this
HTML:
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=9,chrome=1">
<title>Test</title>
</head>
This patch makes the detection of charsets more flexible, allowing for
the use of the HTML 5 charset attribute as well as more terminator
characters ("'" and ">").
I also added a *_data function for the unit tests.
Task-number: QTBUG-5451
Change-Id: I69fe4a04582f0d845cbbe9140a86a950fb7dc861
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Denis Dzyubenko <denis@ddenis.info>
Diffstat (limited to 'src/corelib/codecs/qtextcodec.cpp')
-rw-r--r-- | src/corelib/codecs/qtextcodec.cpp | 32 |
1 files changed, 17 insertions, 15 deletions
diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp index 0e671518ef..511817677c 100644 --- a/src/corelib/codecs/qtextcodec.cpp +++ b/src/corelib/codecs/qtextcodec.cpp @@ -1043,28 +1043,30 @@ QString QTextDecoder::toUnicode(const QByteArray &ba) QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) { // determine charset - int pos; - QTextCodec *c = 0; - - c = QTextCodec::codecForUtfText(ba, c); + QTextCodec *c = QTextCodec::codecForUtfText(ba, 0); if (!c) { QByteArray header = ba.left(512).toLower(); - if ((pos = header.indexOf("http-equiv=")) != -1) { - if ((pos = header.lastIndexOf("meta ", pos)) != -1) { - pos = header.indexOf("charset=", pos) + int(strlen("charset=")); - if (pos != -1) { - int pos2 = header.indexOf('\"', pos+1); - QByteArray cs = header.mid(pos, pos2-pos); - // qDebug("found charset: %s", cs.data()); - c = QTextCodec::codecForName(cs); + int pos = header.indexOf("meta "); + if (pos != -1) { + pos = header.indexOf("charset=", pos); + if (pos != -1) { + pos += qstrlen("charset="); + + int pos2 = pos; + // The attribute can be closed with either """, "'", ">" or "/", + // none of which are valid charset characters. + while (++pos2 < header.size()) { + char ch = header.at(pos2); + if (ch == '\"' || ch == '\'' || ch == '>') { + c = QTextCodec::codecForName(header.mid(pos, pos2 - pos)); + return c ? c : defaultCodec; + } } } } } - if (!c) - c = defaultCodec; - return c; + return defaultCodec; } /*! |