summaryrefslogtreecommitdiffstats
path: root/util/lexgen/main.cpp
diff options
context:
space:
mode:
authorQt by Nokia <qt-info@nokia.com>2011-04-27 12:05:43 +0200
committeraxis <qt-info@nokia.com>2011-04-27 12:05:43 +0200
commit38be0d13830efd2d98281c645c3a60afe05ffece (patch)
tree6ea73f3ec77f7d153333779883e8120f82820abe /util/lexgen/main.cpp
Initial import from the monolithic Qt.
This is the beginning of revision history for this module. If you want to look at revision history older than this, please refer to the Qt Git wiki for how to use Git history grafting. At the time of writing, this wiki is located here: http://qt.gitorious.org/qt/pages/GitIntroductionWithQt If you have already performed the grafting and you don't see any history beyond this commit, try running "git log" with the "--follow" argument. Branched from the monolithic repo, Qt master branch, at commit 896db169ea224deb96c59ce8af800d019de63f12
Diffstat (limited to 'util/lexgen/main.cpp')
-rw-r--r--util/lexgen/main.cpp323
1 files changed, 323 insertions, 0 deletions
diff --git a/util/lexgen/main.cpp b/util/lexgen/main.cpp
new file mode 100644
index 0000000000..1cb5d902a6
--- /dev/null
+++ b/util/lexgen/main.cpp
@@ -0,0 +1,323 @@
+/****************************************************************************
+**
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the utils of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights. These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "nfa.h"
+#include "re2nfa.h"
+#include "configfile.h"
+#include "generator.h"
+
+#include <QFile>
+#include <QCoreApplication>
+#include <QFileInfo>
+#include <QDateTime>
+
+struct Symbol
+{
+ QString token;
+ QString lexem;
+};
+
+static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
+{
+ QList<Symbol> symbols;
+ Symbol lastSymbol;
+ int state = 0;
+ int lastAcceptingState = -1;
+ QString lastAcceptingLexem;
+ int lastAcceptingPos = -1;
+ for (int i = 0; i < input.length(); ++i) {
+ QChar ch = input.at(i);
+ QChar chForInput = ch;
+ if (cfg->caseSensitivity == Qt::CaseInsensitive)
+ chForInput = chForInput.toLower();
+ int next = dfa.at(state).transitions.value(chForInput.unicode());
+ if (cfg->debug)
+ qDebug() << "input" << input.at(i) << "leads to state" << next;
+ if (next) {
+ lastSymbol.lexem.append(input.at(i));
+ lastSymbol.token = dfa.at(next).symbol;
+ if (!lastSymbol.token.isEmpty()) {
+ lastAcceptingState = next;
+ lastAcceptingLexem = lastSymbol.lexem;
+ lastAcceptingPos = i;
+ }
+ state = next;
+ } else {
+ if (lastAcceptingState != -1) {
+ if (cfg->debug)
+ qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
+ Symbol s;
+ s.token = dfa.at(lastAcceptingState).symbol;
+ s.lexem = lastAcceptingLexem;
+ symbols << s;
+ lastSymbol = Symbol();
+ state = 0;
+ i = lastAcceptingPos;
+ lastAcceptingPos = -1;
+ lastAcceptingState = -1;
+ continue;
+ }
+ if (state == 0 || lastSymbol.token.isEmpty()) {
+ if (cfg->debug)
+ qDebug() << "invalid input";
+ if (ok)
+ *ok = false;
+ return symbols;
+ }
+ if (cfg->debug)
+ qDebug() << "appending symbol with token" << lastSymbol.token;
+ symbols << lastSymbol;
+ lastSymbol = Symbol();
+ state = 0;
+ lastAcceptingState = -1;
+ --i;
+ }
+ }
+ if (!lastSymbol.token.isEmpty()) {
+ if (cfg->debug)
+ qDebug() << "appending (last) symbol with token" << lastSymbol.token;
+ symbols << lastSymbol;
+ } else if (lastAcceptingState != -1) {
+ if (cfg->debug)
+ qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
+ Symbol s;
+ s.lexem = lastAcceptingLexem;
+ s.token = dfa.at(lastAcceptingState).symbol;
+ symbols << s;
+ }
+ if (ok)
+ *ok = true;
+ return symbols;
+}
+
+static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
+{
+ QSet<InputType> set;
+
+ QString inputTypeName;
+
+ foreach (const ConfigFile::Entry &entry, section)
+ if (entry.key == QLatin1String("InputType")) {
+ if (!inputTypeName.isEmpty()) {
+ qWarning("Error: InputType field specified multiple times in config file");
+ return QSet<InputType>();
+ }
+ inputTypeName = entry.value;
+ }
+
+ if (inputTypeName.isEmpty())
+ inputTypeName = "quint8";
+
+ if (inputTypeName == "quint8") {
+ for (int i = 1; i < 256; ++i)
+ set.insert(i);
+ } /* else if ### */
+ else {
+ qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
+ return QSet<InputType>();
+ }
+
+ return set;
+}
+
+static bool loadConfig(const QString &ruleFile, Config *cfg)
+{
+ ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
+ if (sections.isEmpty()) {
+ qWarning("Error parsing %s", qPrintable(ruleFile));
+ return false;
+ }
+
+ QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
+ if (maxInputSet.isEmpty())
+ return false;
+
+ Qt::CaseSensitivity cs = Qt::CaseInsensitive;
+ if (sections.value("Options").contains("case-sensitive"))
+ cs = Qt::CaseSensitive;
+
+ cfg->configSections = sections;
+ cfg->caseSensitivity = cs;
+ cfg->className = sections.value("Options").value("classname", "Scanner");
+ cfg->maxInputSet = maxInputSet;
+ cfg->ruleFile = ruleFile;
+ return true;
+}
+
+static DFA generateMachine(const Config &cfg)
+{
+ if (cfg.cache) {
+ QFileInfo ruleInfo(cfg.ruleFile);
+ QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
+ if (cacheInfo.exists()
+ && cacheInfo.lastModified() > ruleInfo.lastModified()) {
+ QFile f(cacheInfo.absoluteFilePath());
+ f.open(QIODevice::ReadOnly);
+ QDataStream stream(&f);
+ DFA machine;
+ stream >> machine;
+ return machine;
+ }
+ }
+
+ QMap<QString, NFA> macros;
+ foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
+ int errCol = 0;
+ if (cfg.debug)
+ qDebug() << "parsing" << e.value;
+ NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
+ if (nfa.isEmpty()) {
+ qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
+ return DFA();
+ }
+ macros.insert(e.key, nfa);
+ }
+
+ if (!cfg.configSections.contains("Tokens")) {
+ qWarning("Rule file does not contain a [Tokens] section!");
+ return DFA();
+ }
+
+ QVector<NFA> tokens;
+
+ foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
+ int errCol = 0;
+ if (cfg.debug)
+ qDebug() << "parsing" << e.value;
+ NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
+ if (tok.isEmpty()) {
+ qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
+ return DFA();
+ }
+ tok.setTerminationSymbol(e.key);
+ tokens.append(tok);
+ }
+
+ NFA giganticStateMachine;
+ foreach (NFA nfa, tokens)
+ if (giganticStateMachine.isEmpty())
+ giganticStateMachine = nfa;
+ else
+ giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
+
+ DFA result = giganticStateMachine.toDFA().minimize();
+ if (cfg.cache) {
+ QFileInfo ruleInfo(cfg.ruleFile);
+ QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
+ QFile f(cacheInfo.absoluteFilePath());
+ f.open(QIODevice::WriteOnly | QIODevice::Truncate);
+ QDataStream stream(&f);
+ stream << result;
+ }
+ return result;
+}
+
+#if !defined(AUTOTEST)
+int main(int argc, char **argv)
+{
+ QCoreApplication app(argc, argv);
+ QString ruleFile;
+ Config cfg;
+
+ const QStringList arguments = app.arguments().mid(1);
+ cfg.debug = arguments.contains("-debug");
+ const bool testRules = arguments.contains("-test");
+ cfg.cache = arguments.contains("-cache");
+
+ foreach (const QString &arg, arguments)
+ if (!arg.startsWith(QLatin1Char('-'))) {
+ ruleFile = arg;
+ break;
+ }
+
+ if (ruleFile.isEmpty()) {
+ qWarning("usage: lexgen [-test rulefile");
+ qWarning(" ");
+ qWarning(" the -test option will cause lexgen to interpret standard input");
+ qWarning(" according to the specified rules and print out pairs of token and");
+ qWarning(" lexical element");
+ return 1;
+ }
+
+ if (!loadConfig(ruleFile, &cfg))
+ return 1;
+
+ DFA machine = generateMachine(cfg);
+ if (machine.isEmpty())
+ return 1;
+
+ if (testRules) {
+ qWarning("Testing:");
+ QString input = QTextStream(stdin).readAll();
+ /*
+ qDebug() << "NFA has" << machine.stateCount() << "states";
+ qDebug() << "Converting to DFA... (this may take a while)";
+ DFA dfa = machine.toDFA();
+ qDebug() << "DFA has" << dfa.count() << "states";
+ qDebug() << "Minimizing...";
+ dfa = dfa.minimize();
+ qDebug() << "Minimized DFA has" << dfa.count() << "states";
+ */
+ DFA dfa = machine;
+ if (cfg.debug)
+ qDebug() << "tokenizing" << input;
+ bool ok = false;
+ QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
+ if (symbols.isEmpty()) {
+ qWarning("No tokens produced!");
+ } else {
+ foreach (Symbol s, symbols)
+ qDebug() << s.token << ":" << s.lexem;
+ }
+ if (ok)
+ qDebug() << symbols.count() << "tokens produced.";
+ else
+ qDebug() << "Error while tokenizing!";
+ } else {
+ Generator gen(machine, cfg);
+ QTextStream(stdout)
+ << gen.generate();
+ }
+
+ return 0;
+}
+#endif
+