From 415b44ffb6ffcdeb4ad5b7cdd6d9d9e17ab3e47f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tor=20Arne=20Vestb=C3=B8?= Date: Mon, 3 May 2010 12:34:27 +0200 Subject: Initial checkin --- simplehtmlparser.cpp | 600 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 600 insertions(+) create mode 100644 simplehtmlparser.cpp (limited to 'simplehtmlparser.cpp') diff --git a/simplehtmlparser.cpp b/simplehtmlparser.cpp new file mode 100644 index 0000000..579564b --- /dev/null +++ b/simplehtmlparser.cpp @@ -0,0 +1,600 @@ +/**************************************************************************** + ** + ** Copyright (C) 2008-2010 Nokia Corporation and/or its subsidiary(-ies). + ** Contact: Nokia Corporation (qt-info@nokia.com) + ** + ** This file is part of the doxygen2qthelp project on Trolltech Labs. + ** + ** This file may be used under the terms of the GNU General Public + ** License version 2.0 or 3.0 as published by the Free Software Foundation + ** and appearing in the file LICENSE.GPL included in the packaging of + ** this file. Please review the following information to ensure GNU + ** General Public Licensing requirements will be met: + ** http://www.fsf.org/licensing/licenses/info/GPLv2.html and + ** http://www.gnu.org/copyleft/gpl.html. + ** + ** If you are unsure which license is appropriate for your use, please + ** contact the sales department at qt-sales@nokia.com. + ** + ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + ** + ****************************************************************************/ + +#include "simplehtmlparser_p.h" +#include "simplehtmlreceiver_p.h" + +#include +#include + +QT_BEGIN_NAMESPACE + +// see also tst_qtextdocumentfragment.cpp +#define MAX_ENTITY 258 +static const struct QTextHtmlEntity +{ + const char *name; + quint16 code; +} entities[MAX_ENTITY]= { + { "AElig", 0x00c6 }, + { "AMP", 38 }, + { "Aacute", 0x00c1 }, + { "Acirc", 0x00c2 }, + { "Agrave", 0x00c0 }, + { "Alpha", 0x0391 }, + { "Aring", 0x00c5 }, + { "Atilde", 0x00c3 }, + { "Auml", 0x00c4 }, + { "Beta", 0x0392 }, + { "Ccedil", 0x00c7 }, + { "Chi", 0x03a7 }, + { "Dagger", 0x2021 }, + { "Delta", 0x0394 }, + { "ETH", 0x00d0 }, + { "Eacute", 0x00c9 }, + { "Ecirc", 0x00ca }, + { "Egrave", 0x00c8 }, + { "Epsilon", 0x0395 }, + { "Eta", 0x0397 }, + { "Euml", 0x00cb }, + { "GT", 62 }, + { "Gamma", 0x0393 }, + { "Iacute", 0x00cd }, + { "Icirc", 0x00ce }, + { "Igrave", 0x00cc }, + { "Iota", 0x0399 }, + { "Iuml", 0x00cf }, + { "Kappa", 0x039a }, + { "LT", 60 }, + { "Lambda", 0x039b }, + { "Mu", 0x039c }, + { "Ntilde", 0x00d1 }, + { "Nu", 0x039d }, + { "OElig", 0x0152 }, + { "Oacute", 0x00d3 }, + { "Ocirc", 0x00d4 }, + { "Ograve", 0x00d2 }, + { "Omega", 0x03a9 }, + { "Omicron", 0x039f }, + { "Oslash", 0x00d8 }, + { "Otilde", 0x00d5 }, + { "Ouml", 0x00d6 }, + { "Phi", 0x03a6 }, + { "Pi", 0x03a0 }, + { "Prime", 0x2033 }, + { "Psi", 0x03a8 }, + { "QUOT", 34 }, + { "Rho", 0x03a1 }, + { "Scaron", 0x0160 }, + { "Sigma", 0x03a3 }, + { "THORN", 0x00de }, + { "Tau", 0x03a4 }, + { "Theta", 0x0398 }, + { "Uacute", 0x00da }, + { "Ucirc", 0x00db }, + { "Ugrave", 0x00d9 }, + { "Upsilon", 0x03a5 }, + { "Uuml", 0x00dc }, + { "Xi", 0x039e }, + { "Yacute", 0x00dd }, + { "Yuml", 0x0178 }, + { "Zeta", 0x0396 }, + { "aacute", 0x00e1 }, + { "acirc", 0x00e2 }, + { "acute", 0x00b4 }, + { "aelig", 0x00e6 }, + { "agrave", 0x00e0 }, + { "alefsym", 0x2135 }, + { "alpha", 0x03b1 }, + { "amp", 38 }, + { "and", 0x22a5 }, + { "ang", 0x2220 }, + { "apos", 0x0027 }, + { "aring", 0x00e5 }, + { "asymp", 0x2248 }, + { "atilde", 0x00e3 }, + { "auml", 0x00e4 }, + { "bdquo", 0x201e }, + { "beta", 0x03b2 }, + { "brvbar", 0x00a6 }, + { "bull", 0x2022 }, + { "cap", 0x2229 }, + { "ccedil", 0x00e7 }, + { "cedil", 0x00b8 }, + { "cent", 0x00a2 }, + { "chi", 0x03c7 }, + { "circ", 0x02c6 }, + { "clubs", 0x2663 }, + { "cong", 0x2245 }, + { "copy", 0x00a9 }, + { "crarr", 0x21b5 }, + { "cup", 0x222a }, + { "curren", 0x00a4 }, + { "dArr", 0x21d3 }, + { "dagger", 0x2020 }, + { "darr", 0x2193 }, + { "deg", 0x00b0 }, + { "delta", 0x03b4 }, + { "diams", 0x2666 }, + { "divide", 0x00f7 }, + { "eacute", 0x00e9 }, + { "ecirc", 0x00ea }, + { "egrave", 0x00e8 }, + { "empty", 0x2205 }, + { "emsp", 0x2003 }, + { "ensp", 0x2002 }, + { "epsilon", 0x03b5 }, + { "equiv", 0x2261 }, + { "eta", 0x03b7 }, + { "eth", 0x00f0 }, + { "euml", 0x00eb }, + { "euro", 0x20ac }, + { "exist", 0x2203 }, + { "fnof", 0x0192 }, + { "forall", 0x2200 }, + { "frac12", 0x00bd }, + { "frac14", 0x00bc }, + { "frac34", 0x00be }, + { "frasl", 0x2044 }, + { "gamma", 0x03b3 }, + { "ge", 0x2265 }, + { "gt", 62 }, + { "hArr", 0x21d4 }, + { "harr", 0x2194 }, + { "hearts", 0x2665 }, + { "hellip", 0x2026 }, + { "iacute", 0x00ed }, + { "icirc", 0x00ee }, + { "iexcl", 0x00a1 }, + { "igrave", 0x00ec }, + { "image", 0x2111 }, + { "infin", 0x221e }, + { "int", 0x222b }, + { "iota", 0x03b9 }, + { "iquest", 0x00bf }, + { "isin", 0x2208 }, + { "iuml", 0x00ef }, + { "kappa", 0x03ba }, + { "lArr", 0x21d0 }, + { "lambda", 0x03bb }, + { "lang", 0x2329 }, + { "laquo", 0x00ab }, + { "larr", 0x2190 }, + { "lceil", 0x2308 }, + { "ldquo", 0x201c }, + { "le", 0x2264 }, + { "lfloor", 0x230a }, + { "lowast", 0x2217 }, + { "loz", 0x25ca }, + { "lrm", 0x200e }, + { "lsaquo", 0x2039 }, + { "lsquo", 0x2018 }, + { "lt", 60 }, + { "macr", 0x00af }, + { "mdash", 0x2014 }, + { "micro", 0x00b5 }, + { "middot", 0x00b7 }, + { "minus", 0x2212 }, + { "mu", 0x03bc }, + { "nabla", 0x2207 }, + { "nbsp", 0x00a0 }, + { "ndash", 0x2013 }, + { "ne", 0x2260 }, + { "ni", 0x220b }, + { "not", 0x00ac }, + { "notin", 0x2209 }, + { "nsub", 0x2284 }, + { "ntilde", 0x00f1 }, + { "nu", 0x03bd }, + { "oacute", 0x00f3 }, + { "ocirc", 0x00f4 }, + { "oelig", 0x0153 }, + { "ograve", 0x00f2 }, + { "oline", 0x203e }, + { "omega", 0x03c9 }, + { "omicron", 0x03bf }, + { "oplus", 0x2295 }, + { "or", 0x22a6 }, + { "ordf", 0x00aa }, + { "ordm", 0x00ba }, + { "oslash", 0x00f8 }, + { "otilde", 0x00f5 }, + { "otimes", 0x2297 }, + { "ouml", 0x00f6 }, + { "para", 0x00b6 }, + { "part", 0x2202 }, + { "percnt", 0x0025 }, + { "permil", 0x2030 }, + { "perp", 0x22a5 }, + { "phi", 0x03c6 }, + { "pi", 0x03c0 }, + { "piv", 0x03d6 }, + { "plusmn", 0x00b1 }, + { "pound", 0x00a3 }, + { "prime", 0x2032 }, + { "prod", 0x220f }, + { "prop", 0x221d }, + { "psi", 0x03c8 }, + { "quot", 34 }, + { "rArr", 0x21d2 }, + { "radic", 0x221a }, + { "rang", 0x232a }, + { "raquo", 0x00bb }, + { "rarr", 0x2192 }, + { "rceil", 0x2309 }, + { "rdquo", 0x201d }, + { "real", 0x211c }, + { "reg", 0x00ae }, + { "rfloor", 0x230b }, + { "rho", 0x03c1 }, + { "rlm", 0x200f }, + { "rsaquo", 0x203a }, + { "rsquo", 0x2019 }, + { "sbquo", 0x201a }, + { "scaron", 0x0161 }, + { "sdot", 0x22c5 }, + { "sect", 0x00a7 }, + { "shy", 0x00ad }, + { "sigma", 0x03c3 }, + { "sigmaf", 0x03c2 }, + { "sim", 0x223c }, + { "spades", 0x2660 }, + { "sub", 0x2282 }, + { "sube", 0x2286 }, + { "sum", 0x2211 }, + { "sup", 0x2283 }, + { "sup1", 0x00b9 }, + { "sup2", 0x00b2 }, + { "sup3", 0x00b3 }, + { "supe", 0x2287 }, + { "szlig", 0x00df }, + { "tau", 0x03c4 }, + { "there4", 0x2234 }, + { "theta", 0x03b8 }, + { "thetasym", 0x03d1 }, + { "thinsp", 0x2009 }, + { "thorn", 0x00fe }, + { "tilde", 0x02dc }, + { "times", 0x00d7 }, + { "trade", 0x2122 }, + { "uArr", 0x21d1 }, + { "uacute", 0x00fa }, + { "uarr", 0x2191 }, + { "ucirc", 0x00fb }, + { "ugrave", 0x00f9 }, + { "uml", 0x00a8 }, + { "upsih", 0x03d2 }, + { "upsilon", 0x03c5 }, + { "uuml", 0x00fc }, + { "weierp", 0x2118 }, + { "xi", 0x03be }, + { "yacute", 0x00fd }, + { "yen", 0x00a5 }, + { "yuml", 0x00ff }, + { "zeta", 0x03b6 }, + { "zwj", 0x200d }, + { "zwnj", 0x200c } +}; + +static bool operator<(const QString &entityStr, const QTextHtmlEntity &entity) +{ + return entityStr < QLatin1String(entity.name); +} + +static bool operator<(const QTextHtmlEntity &entity, const QString &entityStr) +{ + return QLatin1String(entity.name) < entityStr; +} + +static QChar resolveEntity(const QString &entity) +{ + const QTextHtmlEntity *start = &entities[0]; + const QTextHtmlEntity *end = &entities[MAX_ENTITY]; + const QTextHtmlEntity *e = qBinaryFind(start, end, entity); + if (e == end) + return QChar(); + return e->code; +} + +static const uint windowsLatin1ExtendedCharacters[0xA0 - 0x80] = { + 0x20ac, // 0x80 + 0x0081, // 0x81 direct mapping + 0x201a, // 0x82 + 0x0192, // 0x83 + 0x201e, // 0x84 + 0x2026, // 0x85 + 0x2020, // 0x86 + 0x2021, // 0x87 + 0x02C6, // 0x88 + 0x2030, // 0x89 + 0x0160, // 0x8A + 0x2039, // 0x8B + 0x0152, // 0x8C + 0x008D, // 0x8D direct mapping + 0x017D, // 0x8E + 0x008F, // 0x8F directmapping + 0x0090, // 0x90 directmapping + 0x2018, // 0x91 + 0x2019, // 0x92 + 0x201C, // 0x93 + 0X201D, // 0x94 + 0x2022, // 0x95 + 0x2013, // 0x96 + 0x2014, // 0x97 + 0x02DC, // 0x98 + 0x2122, // 0x99 + 0x0161, // 0x9A + 0x203A, // 0x9B + 0x0153, // 0x9C + 0x009D, // 0x9D direct mapping + 0x017E, // 0x9E + 0x0178 // 0x9F +}; + +void SimpleHtmlParser::parse(const QString &text, SimpleHtmlReceiver *receiver) +{ + m_receiver = receiver; + txt = text; + pos = 0; + len = txt.length(); + parse(); + Q_ASSERT(m_receiver != NULL); + m_receiver->onStop(); + //dumpHtml(); +} + +void SimpleHtmlParser::eatSpace() +{ + while (pos < len && txt.at(pos).isSpace() && txt.at(pos) + != QChar::ParagraphSeparator) { + pos++; + } +} + +void SimpleHtmlParser::parse() +{ + QString buffer; + while (pos < len) { + QChar c = txt.at(pos++); + if (c == QLatin1Char('<')) { + if (!buffer.isEmpty()) { + Q_ASSERT(m_receiver != NULL); + m_receiver->onTextChunk(buffer); + buffer.clear(); + } + parseTag(); + } else if (c == QLatin1Char('&')) { + buffer += parseEntity(); + } else { + buffer += c; + } + } + + if (!buffer.isEmpty()) { + Q_ASSERT(m_receiver != NULL); + m_receiver->onTextChunk(buffer); + buffer.clear(); + } +} + +// parses a tag after "<" +void SimpleHtmlParser::parseTag() +{ + eatSpace(); + + // handle comments and other exclamation mark declarations + if (hasPrefix(QLatin1Char('!'))) { + parseExclamationTag(); + return; + } + + // if close tag just close + if (hasPrefix(QLatin1Char('/'))) { + parseCloseTag(); + return; + } + + // parse tag name + const QString tagName = parseWord().toLower(); + + // _need_ at least one space after the tag name, + // otherwise there can't be attributes + QStringList attributes; + if (pos < len && txt.at(pos).isSpace()) { + attributes = parseAttributes(); + } + + Q_ASSERT(m_receiver != NULL); + m_receiver->onTagOpen(tagName, attributes); + + // finish tag + bool tagClosed = false; + while (pos < len && txt.at(pos) != QLatin1Char('>')) { + if (txt.at(pos) == QLatin1Char('/')) + tagClosed = true; + + pos++; + } + pos++; +} + +// parses a tag beginning with "/" +void SimpleHtmlParser::parseCloseTag() +{ + ++pos; + const QString tagName = parseWord().toLower().trimmed(); + Q_ASSERT(m_receiver != NULL); + m_receiver->onTagClose(tagName); + + while (pos < len) { + QChar c = txt.at(pos++); + if (c == QLatin1Char('>')) + break; + } +} + +// parses a tag beginning with "!" +void SimpleHtmlParser::parseExclamationTag() +{ + ++pos; + if (hasPrefix(QLatin1Char('-'),1) && hasPrefix(QLatin1Char('-'),2)) { + pos += 3; + // eat comments + int end = txt.indexOf(QLatin1String("-->"), pos); + pos = (end >= 0 ? end + 3 : len); + } else { + // eat internal tags + while (pos < len) { + QChar c = txt.at(pos++); + if (c == QLatin1Char('>')) + break; + } + } +} + +// parses an entity after "&", and returns it +QString SimpleHtmlParser::parseEntity() +{ + int recover = pos; + QString entity; + while (pos < len) { + QChar c = txt.at(pos++); + if (c.isSpace() || pos - recover > 9) { + goto error; + } + if (c == QLatin1Char(';')) + break; + entity += c; + } + { + QChar resolved = resolveEntity(entity); + if (!resolved.isNull()) + return QString(resolved); + } + if (entity.length() > 1 && entity.at(0) == QLatin1Char('#')) { + entity.remove(0, 1); // removing leading # + + int base = 10; + bool ok = false; + + if (entity.at(0).toLower() == QLatin1Char('x')) { // hex entity? + entity.remove(0, 1); + base = 16; + } + + uint uc = entity.toUInt(&ok, base); + if (ok) { + if (uc >= 0x80 && uc < 0x80 + + (sizeof(windowsLatin1ExtendedCharacters) + /sizeof(windowsLatin1ExtendedCharacters[0]))) + uc = windowsLatin1ExtendedCharacters[uc - 0x80]; + QString str; + if (uc > 0xffff) { + // surrogate pair + uc -= 0x10000; + ushort high = uc/0x400 + 0xd800; + ushort low = uc%0x400 + 0xdc00; + str.append(QChar(high)); + str.append(QChar(low)); + } else { + str.append(QChar(uc)); + } + return str; + } + } +error: + pos = recover; + return QLatin1String("&"); +} + +// parses one word, possibly quoted, and returns it +QString SimpleHtmlParser::parseWord() +{ + QString word; + if (hasPrefix(QLatin1Char('\"'))) { // double quotes + ++pos; + while (pos < len) { + QChar c = txt.at(pos++); + if (c == QLatin1Char('\"')) + break; + else if (c == QLatin1Char('&')) + word += parseEntity(); + else + word += c; + } + } else if (hasPrefix(QLatin1Char('\''))) { // single quotes + ++pos; + while (pos < len) { + QChar c = txt.at(pos++); + if (c == QLatin1Char('\'')) + break; + else + word += c; + } + } else { // normal text + while (pos < len) { + QChar c = txt.at(pos++); + if (c == QLatin1Char('>') + || (c == QLatin1Char('/') && hasPrefix(QLatin1Char('>'), 1)) + || c == QLatin1Char('<') + || c == QLatin1Char('=') + || c.isSpace()) { + --pos; + break; + } + if (c == QLatin1Char('&')) + word += parseEntity(); + else + word += c; + } + } + return word; +} + +QStringList SimpleHtmlParser::parseAttributes() +{ + QStringList attrs; + + while (pos < len) { + eatSpace(); + if (hasPrefix(QLatin1Char('>')) || hasPrefix(QLatin1Char('/'))) + break; + QString key = parseWord().toLower(); + QString value = QLatin1String("1"); + if (key.size() == 0) + break; + eatSpace(); + if (hasPrefix(QLatin1Char('='))){ + pos++; + eatSpace(); + value = parseWord(); + } + if (value.size() == 0) + continue; + attrs << key << value; + } + + return attrs; +} + +QT_END_NAMESPACE -- cgit v1.2.3