summaryrefslogtreecommitdiffstats
path: root/simplehtmlparser.cpp
diff options
context:
space:
mode:
authorTor Arne Vestbø <tor.arne.vestbo@nokia.com>2010-05-03 12:34:27 +0200
committerTor Arne Vestbø <tor.arne.vestbo@nokia.com>2010-05-03 12:34:27 +0200
commit415b44ffb6ffcdeb4ad5b7cdd6d9d9e17ab3e47f (patch)
treed9417f99ef9665167668fe3047416a0d11e1518f /simplehtmlparser.cpp
Initial checkinHEADmaster
Diffstat (limited to 'simplehtmlparser.cpp')
-rw-r--r--simplehtmlparser.cpp600
1 files changed, 600 insertions, 0 deletions
diff --git a/simplehtmlparser.cpp b/simplehtmlparser.cpp
new file mode 100644
index 0000000..579564b
--- /dev/null
+++ b/simplehtmlparser.cpp
@@ -0,0 +1,600 @@
+/****************************************************************************
+ **
+ ** Copyright (C) 2008-2010 Nokia Corporation and/or its subsidiary(-ies).
+ ** Contact: Nokia Corporation (qt-info@nokia.com)
+ **
+ ** This file is part of the doxygen2qthelp project on Trolltech Labs.
+ **
+ ** This file may be used under the terms of the GNU General Public
+ ** License version 2.0 or 3.0 as published by the Free Software Foundation
+ ** and appearing in the file LICENSE.GPL included in the packaging of
+ ** this file. Please review the following information to ensure GNU
+ ** General Public Licensing requirements will be met:
+ ** http://www.fsf.org/licensing/licenses/info/GPLv2.html and
+ ** http://www.gnu.org/copyleft/gpl.html.
+ **
+ ** If you are unsure which license is appropriate for your use, please
+ ** contact the sales department at qt-sales@nokia.com.
+ **
+ ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ **
+ ****************************************************************************/
+
+#include "simplehtmlparser_p.h"
+#include "simplehtmlreceiver_p.h"
+
+#include <QtCore/QtAlgorithms>
+#include <QtCore/QStringList>
+
+QT_BEGIN_NAMESPACE
+
+// see also tst_qtextdocumentfragment.cpp
+#define MAX_ENTITY 258
+static const struct QTextHtmlEntity
+{
+ const char *name;
+ quint16 code;
+} entities[MAX_ENTITY]= {
+ { "AElig", 0x00c6 },
+ { "AMP", 38 },
+ { "Aacute", 0x00c1 },
+ { "Acirc", 0x00c2 },
+ { "Agrave", 0x00c0 },
+ { "Alpha", 0x0391 },
+ { "Aring", 0x00c5 },
+ { "Atilde", 0x00c3 },
+ { "Auml", 0x00c4 },
+ { "Beta", 0x0392 },
+ { "Ccedil", 0x00c7 },
+ { "Chi", 0x03a7 },
+ { "Dagger", 0x2021 },
+ { "Delta", 0x0394 },
+ { "ETH", 0x00d0 },
+ { "Eacute", 0x00c9 },
+ { "Ecirc", 0x00ca },
+ { "Egrave", 0x00c8 },
+ { "Epsilon", 0x0395 },
+ { "Eta", 0x0397 },
+ { "Euml", 0x00cb },
+ { "GT", 62 },
+ { "Gamma", 0x0393 },
+ { "Iacute", 0x00cd },
+ { "Icirc", 0x00ce },
+ { "Igrave", 0x00cc },
+ { "Iota", 0x0399 },
+ { "Iuml", 0x00cf },
+ { "Kappa", 0x039a },
+ { "LT", 60 },
+ { "Lambda", 0x039b },
+ { "Mu", 0x039c },
+ { "Ntilde", 0x00d1 },
+ { "Nu", 0x039d },
+ { "OElig", 0x0152 },
+ { "Oacute", 0x00d3 },
+ { "Ocirc", 0x00d4 },
+ { "Ograve", 0x00d2 },
+ { "Omega", 0x03a9 },
+ { "Omicron", 0x039f },
+ { "Oslash", 0x00d8 },
+ { "Otilde", 0x00d5 },
+ { "Ouml", 0x00d6 },
+ { "Phi", 0x03a6 },
+ { "Pi", 0x03a0 },
+ { "Prime", 0x2033 },
+ { "Psi", 0x03a8 },
+ { "QUOT", 34 },
+ { "Rho", 0x03a1 },
+ { "Scaron", 0x0160 },
+ { "Sigma", 0x03a3 },
+ { "THORN", 0x00de },
+ { "Tau", 0x03a4 },
+ { "Theta", 0x0398 },
+ { "Uacute", 0x00da },
+ { "Ucirc", 0x00db },
+ { "Ugrave", 0x00d9 },
+ { "Upsilon", 0x03a5 },
+ { "Uuml", 0x00dc },
+ { "Xi", 0x039e },
+ { "Yacute", 0x00dd },
+ { "Yuml", 0x0178 },
+ { "Zeta", 0x0396 },
+ { "aacute", 0x00e1 },
+ { "acirc", 0x00e2 },
+ { "acute", 0x00b4 },
+ { "aelig", 0x00e6 },
+ { "agrave", 0x00e0 },
+ { "alefsym", 0x2135 },
+ { "alpha", 0x03b1 },
+ { "amp", 38 },
+ { "and", 0x22a5 },
+ { "ang", 0x2220 },
+ { "apos", 0x0027 },
+ { "aring", 0x00e5 },
+ { "asymp", 0x2248 },
+ { "atilde", 0x00e3 },
+ { "auml", 0x00e4 },
+ { "bdquo", 0x201e },
+ { "beta", 0x03b2 },
+ { "brvbar", 0x00a6 },
+ { "bull", 0x2022 },
+ { "cap", 0x2229 },
+ { "ccedil", 0x00e7 },
+ { "cedil", 0x00b8 },
+ { "cent", 0x00a2 },
+ { "chi", 0x03c7 },
+ { "circ", 0x02c6 },
+ { "clubs", 0x2663 },
+ { "cong", 0x2245 },
+ { "copy", 0x00a9 },
+ { "crarr", 0x21b5 },
+ { "cup", 0x222a },
+ { "curren", 0x00a4 },
+ { "dArr", 0x21d3 },
+ { "dagger", 0x2020 },
+ { "darr", 0x2193 },
+ { "deg", 0x00b0 },
+ { "delta", 0x03b4 },
+ { "diams", 0x2666 },
+ { "divide", 0x00f7 },
+ { "eacute", 0x00e9 },
+ { "ecirc", 0x00ea },
+ { "egrave", 0x00e8 },
+ { "empty", 0x2205 },
+ { "emsp", 0x2003 },
+ { "ensp", 0x2002 },
+ { "epsilon", 0x03b5 },
+ { "equiv", 0x2261 },
+ { "eta", 0x03b7 },
+ { "eth", 0x00f0 },
+ { "euml", 0x00eb },
+ { "euro", 0x20ac },
+ { "exist", 0x2203 },
+ { "fnof", 0x0192 },
+ { "forall", 0x2200 },
+ { "frac12", 0x00bd },
+ { "frac14", 0x00bc },
+ { "frac34", 0x00be },
+ { "frasl", 0x2044 },
+ { "gamma", 0x03b3 },
+ { "ge", 0x2265 },
+ { "gt", 62 },
+ { "hArr", 0x21d4 },
+ { "harr", 0x2194 },
+ { "hearts", 0x2665 },
+ { "hellip", 0x2026 },
+ { "iacute", 0x00ed },
+ { "icirc", 0x00ee },
+ { "iexcl", 0x00a1 },
+ { "igrave", 0x00ec },
+ { "image", 0x2111 },
+ { "infin", 0x221e },
+ { "int", 0x222b },
+ { "iota", 0x03b9 },
+ { "iquest", 0x00bf },
+ { "isin", 0x2208 },
+ { "iuml", 0x00ef },
+ { "kappa", 0x03ba },
+ { "lArr", 0x21d0 },
+ { "lambda", 0x03bb },
+ { "lang", 0x2329 },
+ { "laquo", 0x00ab },
+ { "larr", 0x2190 },
+ { "lceil", 0x2308 },
+ { "ldquo", 0x201c },
+ { "le", 0x2264 },
+ { "lfloor", 0x230a },
+ { "lowast", 0x2217 },
+ { "loz", 0x25ca },
+ { "lrm", 0x200e },
+ { "lsaquo", 0x2039 },
+ { "lsquo", 0x2018 },
+ { "lt", 60 },
+ { "macr", 0x00af },
+ { "mdash", 0x2014 },
+ { "micro", 0x00b5 },
+ { "middot", 0x00b7 },
+ { "minus", 0x2212 },
+ { "mu", 0x03bc },
+ { "nabla", 0x2207 },
+ { "nbsp", 0x00a0 },
+ { "ndash", 0x2013 },
+ { "ne", 0x2260 },
+ { "ni", 0x220b },
+ { "not", 0x00ac },
+ { "notin", 0x2209 },
+ { "nsub", 0x2284 },
+ { "ntilde", 0x00f1 },
+ { "nu", 0x03bd },
+ { "oacute", 0x00f3 },
+ { "ocirc", 0x00f4 },
+ { "oelig", 0x0153 },
+ { "ograve", 0x00f2 },
+ { "oline", 0x203e },
+ { "omega", 0x03c9 },
+ { "omicron", 0x03bf },
+ { "oplus", 0x2295 },
+ { "or", 0x22a6 },
+ { "ordf", 0x00aa },
+ { "ordm", 0x00ba },
+ { "oslash", 0x00f8 },
+ { "otilde", 0x00f5 },
+ { "otimes", 0x2297 },
+ { "ouml", 0x00f6 },
+ { "para", 0x00b6 },
+ { "part", 0x2202 },
+ { "percnt", 0x0025 },
+ { "permil", 0x2030 },
+ { "perp", 0x22a5 },
+ { "phi", 0x03c6 },
+ { "pi", 0x03c0 },
+ { "piv", 0x03d6 },
+ { "plusmn", 0x00b1 },
+ { "pound", 0x00a3 },
+ { "prime", 0x2032 },
+ { "prod", 0x220f },
+ { "prop", 0x221d },
+ { "psi", 0x03c8 },
+ { "quot", 34 },
+ { "rArr", 0x21d2 },
+ { "radic", 0x221a },
+ { "rang", 0x232a },
+ { "raquo", 0x00bb },
+ { "rarr", 0x2192 },
+ { "rceil", 0x2309 },
+ { "rdquo", 0x201d },
+ { "real", 0x211c },
+ { "reg", 0x00ae },
+ { "rfloor", 0x230b },
+ { "rho", 0x03c1 },
+ { "rlm", 0x200f },
+ { "rsaquo", 0x203a },
+ { "rsquo", 0x2019 },
+ { "sbquo", 0x201a },
+ { "scaron", 0x0161 },
+ { "sdot", 0x22c5 },
+ { "sect", 0x00a7 },
+ { "shy", 0x00ad },
+ { "sigma", 0x03c3 },
+ { "sigmaf", 0x03c2 },
+ { "sim", 0x223c },
+ { "spades", 0x2660 },
+ { "sub", 0x2282 },
+ { "sube", 0x2286 },
+ { "sum", 0x2211 },
+ { "sup", 0x2283 },
+ { "sup1", 0x00b9 },
+ { "sup2", 0x00b2 },
+ { "sup3", 0x00b3 },
+ { "supe", 0x2287 },
+ { "szlig", 0x00df },
+ { "tau", 0x03c4 },
+ { "there4", 0x2234 },
+ { "theta", 0x03b8 },
+ { "thetasym", 0x03d1 },
+ { "thinsp", 0x2009 },
+ { "thorn", 0x00fe },
+ { "tilde", 0x02dc },
+ { "times", 0x00d7 },
+ { "trade", 0x2122 },
+ { "uArr", 0x21d1 },
+ { "uacute", 0x00fa },
+ { "uarr", 0x2191 },
+ { "ucirc", 0x00fb },
+ { "ugrave", 0x00f9 },
+ { "uml", 0x00a8 },
+ { "upsih", 0x03d2 },
+ { "upsilon", 0x03c5 },
+ { "uuml", 0x00fc },
+ { "weierp", 0x2118 },
+ { "xi", 0x03be },
+ { "yacute", 0x00fd },
+ { "yen", 0x00a5 },
+ { "yuml", 0x00ff },
+ { "zeta", 0x03b6 },
+ { "zwj", 0x200d },
+ { "zwnj", 0x200c }
+};
+
+static bool operator<(const QString &entityStr, const QTextHtmlEntity &entity)
+{
+ return entityStr < QLatin1String(entity.name);
+}
+
+static bool operator<(const QTextHtmlEntity &entity, const QString &entityStr)
+{
+ return QLatin1String(entity.name) < entityStr;
+}
+
+static QChar resolveEntity(const QString &entity)
+{
+ const QTextHtmlEntity *start = &entities[0];
+ const QTextHtmlEntity *end = &entities[MAX_ENTITY];
+ const QTextHtmlEntity *e = qBinaryFind(start, end, entity);
+ if (e == end)
+ return QChar();
+ return e->code;
+}
+
+static const uint windowsLatin1ExtendedCharacters[0xA0 - 0x80] = {
+ 0x20ac, // 0x80
+ 0x0081, // 0x81 direct mapping
+ 0x201a, // 0x82
+ 0x0192, // 0x83
+ 0x201e, // 0x84
+ 0x2026, // 0x85
+ 0x2020, // 0x86
+ 0x2021, // 0x87
+ 0x02C6, // 0x88
+ 0x2030, // 0x89
+ 0x0160, // 0x8A
+ 0x2039, // 0x8B
+ 0x0152, // 0x8C
+ 0x008D, // 0x8D direct mapping
+ 0x017D, // 0x8E
+ 0x008F, // 0x8F directmapping
+ 0x0090, // 0x90 directmapping
+ 0x2018, // 0x91
+ 0x2019, // 0x92
+ 0x201C, // 0x93
+ 0X201D, // 0x94
+ 0x2022, // 0x95
+ 0x2013, // 0x96
+ 0x2014, // 0x97
+ 0x02DC, // 0x98
+ 0x2122, // 0x99
+ 0x0161, // 0x9A
+ 0x203A, // 0x9B
+ 0x0153, // 0x9C
+ 0x009D, // 0x9D direct mapping
+ 0x017E, // 0x9E
+ 0x0178 // 0x9F
+};
+
+void SimpleHtmlParser::parse(const QString &text, SimpleHtmlReceiver *receiver)
+{
+ m_receiver = receiver;
+ txt = text;
+ pos = 0;
+ len = txt.length();
+ parse();
+ Q_ASSERT(m_receiver != NULL);
+ m_receiver->onStop();
+ //dumpHtml();
+}
+
+void SimpleHtmlParser::eatSpace()
+{
+ while (pos < len && txt.at(pos).isSpace() && txt.at(pos)
+ != QChar::ParagraphSeparator) {
+ pos++;
+ }
+}
+
+void SimpleHtmlParser::parse()
+{
+ QString buffer;
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c == QLatin1Char('<')) {
+ if (!buffer.isEmpty()) {
+ Q_ASSERT(m_receiver != NULL);
+ m_receiver->onTextChunk(buffer);
+ buffer.clear();
+ }
+ parseTag();
+ } else if (c == QLatin1Char('&')) {
+ buffer += parseEntity();
+ } else {
+ buffer += c;
+ }
+ }
+
+ if (!buffer.isEmpty()) {
+ Q_ASSERT(m_receiver != NULL);
+ m_receiver->onTextChunk(buffer);
+ buffer.clear();
+ }
+}
+
+// parses a tag after "<"
+void SimpleHtmlParser::parseTag()
+{
+ eatSpace();
+
+ // handle comments and other exclamation mark declarations
+ if (hasPrefix(QLatin1Char('!'))) {
+ parseExclamationTag();
+ return;
+ }
+
+ // if close tag just close
+ if (hasPrefix(QLatin1Char('/'))) {
+ parseCloseTag();
+ return;
+ }
+
+ // parse tag name
+ const QString tagName = parseWord().toLower();
+
+ // _need_ at least one space after the tag name,
+ // otherwise there can't be attributes
+ QStringList attributes;
+ if (pos < len && txt.at(pos).isSpace()) {
+ attributes = parseAttributes();
+ }
+
+ Q_ASSERT(m_receiver != NULL);
+ m_receiver->onTagOpen(tagName, attributes);
+
+ // finish tag
+ bool tagClosed = false;
+ while (pos < len && txt.at(pos) != QLatin1Char('>')) {
+ if (txt.at(pos) == QLatin1Char('/'))
+ tagClosed = true;
+
+ pos++;
+ }
+ pos++;
+}
+
+// parses a tag beginning with "/"
+void SimpleHtmlParser::parseCloseTag()
+{
+ ++pos;
+ const QString tagName = parseWord().toLower().trimmed();
+ Q_ASSERT(m_receiver != NULL);
+ m_receiver->onTagClose(tagName);
+
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c == QLatin1Char('>'))
+ break;
+ }
+}
+
+// parses a tag beginning with "!"
+void SimpleHtmlParser::parseExclamationTag()
+{
+ ++pos;
+ if (hasPrefix(QLatin1Char('-'),1) && hasPrefix(QLatin1Char('-'),2)) {
+ pos += 3;
+ // eat comments
+ int end = txt.indexOf(QLatin1String("-->"), pos);
+ pos = (end >= 0 ? end + 3 : len);
+ } else {
+ // eat internal tags
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c == QLatin1Char('>'))
+ break;
+ }
+ }
+}
+
+// parses an entity after "&", and returns it
+QString SimpleHtmlParser::parseEntity()
+{
+ int recover = pos;
+ QString entity;
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c.isSpace() || pos - recover > 9) {
+ goto error;
+ }
+ if (c == QLatin1Char(';'))
+ break;
+ entity += c;
+ }
+ {
+ QChar resolved = resolveEntity(entity);
+ if (!resolved.isNull())
+ return QString(resolved);
+ }
+ if (entity.length() > 1 && entity.at(0) == QLatin1Char('#')) {
+ entity.remove(0, 1); // removing leading #
+
+ int base = 10;
+ bool ok = false;
+
+ if (entity.at(0).toLower() == QLatin1Char('x')) { // hex entity?
+ entity.remove(0, 1);
+ base = 16;
+ }
+
+ uint uc = entity.toUInt(&ok, base);
+ if (ok) {
+ if (uc >= 0x80 && uc < 0x80
+ + (sizeof(windowsLatin1ExtendedCharacters)
+ /sizeof(windowsLatin1ExtendedCharacters[0])))
+ uc = windowsLatin1ExtendedCharacters[uc - 0x80];
+ QString str;
+ if (uc > 0xffff) {
+ // surrogate pair
+ uc -= 0x10000;
+ ushort high = uc/0x400 + 0xd800;
+ ushort low = uc%0x400 + 0xdc00;
+ str.append(QChar(high));
+ str.append(QChar(low));
+ } else {
+ str.append(QChar(uc));
+ }
+ return str;
+ }
+ }
+error:
+ pos = recover;
+ return QLatin1String("&");
+}
+
+// parses one word, possibly quoted, and returns it
+QString SimpleHtmlParser::parseWord()
+{
+ QString word;
+ if (hasPrefix(QLatin1Char('\"'))) { // double quotes
+ ++pos;
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c == QLatin1Char('\"'))
+ break;
+ else if (c == QLatin1Char('&'))
+ word += parseEntity();
+ else
+ word += c;
+ }
+ } else if (hasPrefix(QLatin1Char('\''))) { // single quotes
+ ++pos;
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c == QLatin1Char('\''))
+ break;
+ else
+ word += c;
+ }
+ } else { // normal text
+ while (pos < len) {
+ QChar c = txt.at(pos++);
+ if (c == QLatin1Char('>')
+ || (c == QLatin1Char('/') && hasPrefix(QLatin1Char('>'), 1))
+ || c == QLatin1Char('<')
+ || c == QLatin1Char('=')
+ || c.isSpace()) {
+ --pos;
+ break;
+ }
+ if (c == QLatin1Char('&'))
+ word += parseEntity();
+ else
+ word += c;
+ }
+ }
+ return word;
+}
+
+QStringList SimpleHtmlParser::parseAttributes()
+{
+ QStringList attrs;
+
+ while (pos < len) {
+ eatSpace();
+ if (hasPrefix(QLatin1Char('>')) || hasPrefix(QLatin1Char('/')))
+ break;
+ QString key = parseWord().toLower();
+ QString value = QLatin1String("1");
+ if (key.size() == 0)
+ break;
+ eatSpace();
+ if (hasPrefix(QLatin1Char('='))){
+ pos++;
+ eatSpace();
+ value = parseWord();
+ }
+ if (value.size() == 0)
+ continue;
+ attrs << key << value;
+ }
+
+ return attrs;
+}
+
+QT_END_NAMESPACE