summaryrefslogtreecommitdiffstats
path: root/util/lexgen/re2nfa.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'util/lexgen/re2nfa.cpp')
-rw-r--r--util/lexgen/re2nfa.cpp547
1 files changed, 547 insertions, 0 deletions
diff --git a/util/lexgen/re2nfa.cpp b/util/lexgen/re2nfa.cpp
new file mode 100644
index 0000000000..77cf5019fe
--- /dev/null
+++ b/util/lexgen/re2nfa.cpp
@@ -0,0 +1,547 @@
+/****************************************************************************
+**
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the utils of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights. These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "re2nfa.h"
+#include "tokenizer.cpp"
+
+RE2NFA::RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
+ : macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
+{
+}
+
+NFA RE2NFA::parse(const QString &expression, int *errCol)
+{
+ tokenize(expression);
+
+ if (symbols.isEmpty())
+ return NFA();
+
+ index = 0;
+
+ NFA result = parseExpr();
+ if (result.isEmpty()) {
+ if (errCol)
+ *errCol = errorColumn;
+ }
+ return result;
+}
+
+NFA RE2NFA::parseExpr()
+{
+ NFA value = parseBranch();
+ while (test(TOK_OR)) {
+ NFA rhs = parseBranch();
+ value = NFA::createAlternatingNFA(value, rhs);
+ }
+ return value;
+}
+
+NFA RE2NFA::parseBranch()
+{
+ NFA value = parsePiece();
+ if (!hasNext())
+ return value;
+ NFA next;
+ do {
+ next = parsePiece();
+ if (!next.isEmpty())
+ value = NFA::createConcatenatingNFA(value, next);
+ } while (!next.isEmpty() && hasNext());
+ return value;
+}
+
+NFA RE2NFA::parsePiece()
+{
+ NFA atom = parseAtom();
+ if (atom.isEmpty() || !hasNext())
+ return atom;
+ return parseMaybeQuantifier(atom);
+}
+
+NFA RE2NFA::parseAtom()
+{
+ // ####
+ switch (next()) {
+ case TOK_STRING:
+ return createCharNFA();
+ case TOK_LPAREN: {
+ NFA subExpr = parseExpr();
+ next(TOK_RPAREN);
+ return subExpr;
+ }
+ case TOK_LBRACE: {
+ QString macroName = lexemUntil(TOK_RBRACE);
+ QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
+ if (macro == macros.end()) {
+ qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
+ return NFA();
+ }
+ return *macro;
+ }
+ case TOK_LBRACKET: {
+ NFA set = parseSet();
+ next(TOK_RBRACKET);
+ return set;
+ }
+ case TOK_SEQUENCE:
+ return parseSet2();
+ case TOK_DOT:
+ return NFA::createSetNFA(maxInputSet);
+ default:
+ prev();
+ return NFA();
+ }
+}
+
+NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
+{
+ // ####
+ switch (next()) {
+ case TOK_STAR:
+ return NFA::createOptionalNFA(nfa);
+ case TOK_QUESTION:
+ return NFA::createZeroOrOneNFA(nfa);
+ case TOK_PLUS:
+ return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
+ case TOK_LBRACE: {
+ const int rewind = index - 1;
+
+ QString lexemBeforeComma;
+ QString lexemAfterComma;
+ bool seenComma = false;
+ forever {
+ if (test(TOK_COMMA)) {
+ if (seenComma) {
+ errorColumn = symbol().column;
+ return NFA();
+ }
+ seenComma = true;
+ } else if (test(TOK_RBRACE)) {
+ break;
+ } else {
+ next(TOK_STRING);
+ if (seenComma)
+ lexemAfterComma += symbol().lexem;
+ else
+ lexemBeforeComma += symbol().lexem;
+ }
+ }
+ bool isNumber = false;
+ int min = lexemBeforeComma.toInt(&isNumber);
+ if (!isNumber) {
+ index = rewind;
+ return nfa;
+ }
+ int max = min;
+ if (seenComma) {
+ max = lexemAfterComma.toInt(&isNumber);
+ if (!isNumber) {
+ errorColumn = symbol().column;
+ return NFA();
+ }
+ }
+ return NFA::applyQuantity(nfa, min, max);
+ }
+ default:
+ prev();
+ return nfa;
+ }
+}
+
+NFA RE2NFA::parseSet()
+{
+ QSet<InputType> set;
+ bool negate = false;
+
+ next(TOK_STRING);
+
+ do {
+ Q_ASSERT(symbol().lexem.length() == 1);
+ // ###
+ QChar ch = symbol().lexem.at(0);
+ if (set.isEmpty() && ch == QLatin1Char('^')) {
+ negate = true;
+ continue;
+ }
+
+ // look ahead for ranges like a-z
+ bool rangeFound = false;
+ if (test(TOK_STRING)) {
+ if (symbol().lexem.length() == 1
+ && symbol().lexem.at(0) == QLatin1Char('-')) {
+ next(TOK_STRING);
+ Q_ASSERT(symbol().lexem.length() == 1);
+ QChar last = symbol().lexem.at(0);
+
+ if (ch.unicode() > last.unicode())
+ qSwap(ch, last);
+
+ for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
+ if (caseSensitivity == Qt::CaseInsensitive) {
+ set.insert(QChar(i).toLower().unicode());
+ } else {
+ set.insert(i);
+ }
+ }
+
+ rangeFound = true;
+ } else {
+ prev();
+ }
+ }
+
+ if (!rangeFound) {
+ if (caseSensitivity == Qt::CaseInsensitive) {
+ set.insert(ch.toLower().unicode());
+ } else {
+ set.insert(ch.unicode());
+ }
+ }
+ } while (test(TOK_STRING));
+
+ if (negate) {
+ QSet<InputType> negatedSet = maxInputSet;
+ negatedSet.subtract(set);
+ set = negatedSet;
+ }
+
+ return NFA::createSetNFA(set);
+}
+
+NFA RE2NFA::parseSet2()
+{
+ QSet<InputType> set;
+ bool negate = false;
+
+ QString str = symbol().lexem;
+ // strip off brackets
+ str.chop(1);
+ str.remove(0, 1);
+
+ int i = 0;
+ while (i < str.length()) {
+ // ###
+ QChar ch = str.at(i++);
+ if (set.isEmpty() && ch == QLatin1Char('^')) {
+ negate = true;
+ continue;
+ }
+
+ // look ahead for ranges like a-z
+ bool rangeFound = false;
+ if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
+ ++i;
+ QChar last = str.at(i++);
+
+ if (ch.unicode() > last.unicode())
+ qSwap(ch, last);
+
+ for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
+ if (caseSensitivity == Qt::CaseInsensitive) {
+ set.insert(QChar(i).toLower().unicode());
+ } else {
+ set.insert(i);
+ }
+ }
+
+ rangeFound = true;
+ }
+
+ if (!rangeFound) {
+ if (caseSensitivity == Qt::CaseInsensitive) {
+ set.insert(ch.toLower().unicode());
+ } else {
+ set.insert(ch.unicode());
+ }
+ }
+ }
+
+ if (negate) {
+ QSet<InputType> negatedSet = maxInputSet;
+ negatedSet.subtract(set);
+ set = negatedSet;
+ }
+
+ return NFA::createSetNFA(set);
+}
+NFA RE2NFA::createCharNFA()
+{
+ NFA nfa;
+ // ####
+ if (caseSensitivity == Qt::CaseInsensitive) {
+ nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
+ } else {
+ nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
+ }
+ return nfa;
+}
+
+static inline int skipQuote(const QString &str, int pos)
+{
+ while (pos < str.length()
+ && str.at(pos) != QLatin1Char('"')) {
+ if (str.at(pos) == QLatin1Char('\\')) {
+ ++pos;
+ if (pos >= str.length())
+ break;
+ }
+ ++pos;
+ }
+ if (pos < str.length())
+ ++pos;
+ return pos;
+}
+
+#if 0
+static const char*tokStr(Token t)
+{
+ switch (t) {
+ case TOK_INVALID: return "TOK_INVALID";
+ case TOK_STRING: return "TOK_STRING";
+ case TOK_LBRACE: return "TOK_LBRACE";
+ case TOK_RBRACE: return "TOK_RBRACE";
+ case TOK_LBRACKET: return "TOK_LBRACKET";
+ case TOK_RBRACKET: return "TOK_RBRACKET";
+ case TOK_LPAREN: return "TOK_LPAREN";
+ case TOK_RPAREN: return "TOK_RPAREN";
+ case TOK_COMMA: return "TOK_COMMA";
+ case TOK_STAR: return "TOK_STAR";
+ case TOK_OR: return "TOK_OR";
+ case TOK_QUESTION: return "TOK_QUESTION";
+ case TOK_DOT: return "TOK_DOT";
+ case TOK_PLUS: return "TOK_PLUS";
+ case TOK_SEQUENCE: return "TOK_SEQUENCE";
+ case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
+ }
+ return "";
+}
+#endif
+
+void RE2NFA::tokenize(const QString &input)
+{
+ symbols.clear();
+#if 1
+ RegExpTokenizer tokenizer(input);
+ Symbol sym;
+ int tok = tokenizer.lex();
+ while (tok != -1) {
+ Symbol sym;
+ sym.token = static_cast<Token>(tok);
+ sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
+
+ if (sym.token == TOK_QUOTED_STRING) {
+ sym.lexem.chop(1);
+ sym.lexem.remove(0, 1);
+ sym.token = TOK_STRING;
+ }
+
+ if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
+ for (int i = 0; i < sym.lexem.length(); ++i) {
+ if (sym.lexem.at(i) == '\\') {
+ if (i >= sym.lexem.length() - 1)
+ break;
+ QChar ch = sym.lexem.at(i + 1);
+ if (ch == QLatin1Char('n')) {
+ ch = '\n';
+ } else if (ch == QLatin1Char('r')) {
+ ch = '\r';
+ } else if (ch == QLatin1Char('t')) {
+ ch = '\t';
+ } else if (ch == QLatin1Char('f')) {
+ ch = '\f';
+ }
+ sym.lexem.replace(i, 2, ch);
+ }
+ }
+ }
+
+ /*
+ if (sym.token == TOK_SEQUENCE) {
+ Symbol s;
+ s.token = TOK_LBRACKET;
+ s.lexem = "[";
+ symbols.append(s);
+
+ for (int i = 1; i < sym.lexem.length() - 1; ++i) {
+ s.token = TOK_STRING;
+ s.lexem = sym.lexem.at(i);
+ symbols.append(s);
+ }
+
+ s.token = TOK_RBRACKET;
+ s.lexem = "]";
+ symbols.append(s);
+
+ tok = tokenizer.lex();
+ continue;
+ }
+ */
+
+ symbols.append(sym);
+ tok = tokenizer.lex();
+ }
+#else
+ int pos = 0;
+ bool insideSet = false;
+ while (pos < input.length()) {
+ QChar ch = input.at(pos);
+
+ Symbol sym;
+ sym.column = pos;
+ sym.token = TOK_INVALID;
+ sym.lexem = QString(ch);
+ switch (ch.toLatin1()) {
+ case '"': {
+ if (insideSet) {
+ sym.token = TOK_STRING;
+ sym.lexem = QString(ch);
+ symbols += sym;
+ ++pos;
+ continue;
+ }
+ if (pos + 1 >= input.length())
+ return;
+ int quoteEnd = skipQuote(input, pos + 1);
+ sym.token = TOK_STRING;
+ sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
+ symbols += sym;
+ pos = quoteEnd;
+ continue;
+ }
+ case '{':
+ sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
+ break;
+ case '}':
+ sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
+ break;
+ case '[':
+ insideSet = true;
+ sym.token = TOK_LBRACKET;
+ break;
+ case ']':
+ insideSet = false;
+ sym.token = TOK_RBRACKET;
+ break;
+ case '(':
+ sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
+ break;
+ case ')':
+ sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
+ break;
+ case ',':
+ sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
+ break;
+ case '*':
+ sym.token = (insideSet ? TOK_STRING : TOK_STAR);
+ break;
+ case '|':
+ sym.token = (insideSet ? TOK_STRING : TOK_OR);
+ break;
+ case '?':
+ sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
+ break;
+ case '.':
+ sym.token = (insideSet ? TOK_STRING : TOK_DOT);
+ break;
+ case '+':
+ sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
+ break;
+ case '\\':
+ ++pos;
+ if (pos >= input.length())
+ return;
+ ch = input.at(pos);
+ if (ch == QLatin1Char('n')) {
+ ch = '\n';
+ } else if (ch == QLatin1Char('r')) {
+ ch = '\r';
+ } else if (ch == QLatin1Char('t')) {
+ ch = '\t';
+ } else if (ch == QLatin1Char('f')) {
+ ch = '\f';
+ }
+ // fall through
+ default:
+ sym.token = TOK_STRING;
+ sym.lexem = QString(ch);
+ symbols += sym;
+ ++pos;
+ continue;
+ }
+ symbols += sym;
+ ++pos;
+ }
+#endif
+#if 0
+ foreach (Symbol s, symbols) {
+ qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
+ }
+#endif
+}
+
+bool RE2NFA::next(Token t)
+{
+ if (hasNext() && next() == t)
+ return true;
+ errorColumn = symbol().column;
+ Q_ASSERT(false);
+ return false;
+}
+
+bool RE2NFA::test(Token t)
+{
+ if (index >= symbols.count())
+ return false;
+ if (symbols.at(index).token == t) {
+ ++index;
+ return true;
+ }
+ return false;
+}
+
+QString RE2NFA::lexemUntil(Token t)
+{
+ QString lexem;
+ while (hasNext() && next() != t)
+ lexem += symbol().lexem;
+ return lexem;
+}
+