summary | refs | log | tree | commit | diff | stats
path: root/tools/porting/src/tokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tools/porting/src/tokenizer.cpp')
-rw-r--r-- tools/porting/src/tokenizer.cpp 491
1 files changed, 491 insertions, 0 deletions
diff --git a/tools/porting/src/tokenizer.cpp b/tools/porting/src/tokenizer.cpp
new file mode 100644
index 0000000..1eacd88
--- /dev/null
+++ b/tools/porting/src/tokenizer.cpp
@@ -0,0 +1,491 @@
+/****************************************************************************
+**
+** Copyright (C) 2001-2004 Roberto Raggi
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the qt3to4 porting application of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights. These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "tokenizer.h"
+#include "tokens.h"
+#include <QDateTime>
+#include <QHash>
+#include <ctype.h>
+
+QT_BEGIN_NAMESPACE
+
+using TokenEngine::Token;
+
+static QHash<QByteArray, bool> preprocessed; // NOTE(review): not referenced anywhere in the code shown here -- confirm before removing
+bool Tokenizer::s_initialized = false; // set to true by setupScanTable(); the constructor checks it so the tables are built once
+Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1]; // one scanner per byte value 0..127; slot 128 is used by nextToken() for every byte >= 128
+int Tokenizer::s_attr_table[256]; // per-byte A_* character-class flags, indexed by the unsigned byte value
+
+/*
+    Creates a tokenizer with no input attached. The static scan and
+    attribute tables are built by setupScanTable() unless s_initialized
+    says that has already happened.
+*/
+Tokenizer::Tokenizer()
+    : m_buffer(0),
+      m_ptr(0)
+{
+    if (s_initialized)
+        return;
+
+    setupScanTable();
+}
+
+Tokenizer::~Tokenizer()
+{ // Intentionally empty: no explicit cleanup is performed here.
+}
+
+// Character-class bits stored in s_attr_table. Each basic class owns one
+// bit so that classes can be combined and tested with a bitwise AND.
+enum
+{
+    A_Alpha      = 1 << 0,              // letters and '_'
+    A_Digit      = 1 << 1,              // decimal digits
+    A_Whitespace = 1 << 2,              // whitespace other than '\r' and '\n'
+    A_Alphanum   = A_Alpha | A_Digit    // either of the two above
+};
+
+/*
+    Fills the two static lookup tables used by the scanner and marks them
+    as initialized.
+
+    s_scan_table maps each byte value 0..127 to the member function that
+    scans a token starting with that byte; slot 128 holds the scanner used
+    for every byte >= 128. s_attr_table holds the A_* character-class
+    flags for each byte value; only entries below 128 ever receive a flag.
+*/
+void Tokenizer::setupScanTable()
+{
+    s_initialized = true;
+
+    // Clear the attribute table element by element. The entries are ints,
+    // so a byte count of 256 (as passed to memset previously) would only
+    // have covered the first 256 / sizeof(int) entries.
+    for (int i = 0; i < 256; ++i)
+        s_attr_table[i] = 0;
+
+    for (int i = 0; i < 128; ++i) {
+        switch (i) {
+        // Characters that may start a multi-character operator.
+        case ':':
+        case '*':
+        case '%':
+        case '^':
+        case '=':
+        case '!':
+        case '&':
+        case '|':
+        case '+':
+        case '<':
+        case '>':
+        case '-':
+        case '.':
+            s_scan_table[i] = &Tokenizer::scanOperator;
+            break;
+
+        // Line ends get their own scanner and therefore never receive
+        // the A_Whitespace flag assigned in the default branch.
+        case '\r':
+        case '\n':
+            s_scan_table[i] = &Tokenizer::scanNewline;
+            break;
+
+        case '#':
+            s_scan_table[i] = &Tokenizer::scanPreprocessor;
+            break;
+
+        // '/' starts a comment or, failing that, is handed on to
+        // scanOperator() by scanComment().
+        case '/':
+            s_scan_table[i] = &Tokenizer::scanComment;
+            break;
+
+        case '\'':
+            s_scan_table[i] = &Tokenizer::scanCharLiteral;
+            break;
+
+        case '"':
+            s_scan_table[i] = &Tokenizer::scanStringLiteral;
+            break;
+
+        default:
+            if (isspace(i)) {
+                s_scan_table[i] = &Tokenizer::scanWhiteSpaces;
+                s_attr_table[i] |= A_Whitespace;
+            } else if (isalpha(i) || i == '_') {
+                // '_' counts as a letter so it can start and continue
+                // an identifier.
+                s_scan_table[i] = &Tokenizer::scanIdentifier;
+                s_attr_table[i] |= A_Alpha;
+            } else if (isdigit(i)) {
+                s_scan_table[i] = &Tokenizer::scanNumberLiteral;
+                s_attr_table[i] |= A_Digit;
+            } else {
+                // Everything else becomes a one-character token.
+                s_scan_table[i] = &Tokenizer::scanChar;
+            }
+        }
+    }
+
+    // Shared entry for all bytes outside the 7-bit range.
+    s_scan_table[128] = &Tokenizer::scanUnicodeChar;
+}
+
+/*
+    Splits \a text into tokens and returns them in input order.
+
+    Any tokens left from a previous call are discarded first. Scanning
+    stops at the first token for which nextToken() reports end of input;
+    that token is not part of the result.
+*/
+QVector<TokenEngine::Token> Tokenizer::tokenize(QByteArray text)
+{
+    m_tokens.clear();
+
+    // Restart scanning at the beginning of the new input.
+    m_buffer = text;
+    m_ptr = 0;
+
+    bool atEnd = false;
+    while (!atEnd) {
+        Token current;
+        atEnd = nextToken(current);
+        if (!atEnd)
+            m_tokens.append(current);
+    }
+
+    return m_tokens;
+}
+
+/*
+    Scans one token at the current position and stores its start offset
+    and length in \a tok.
+
+    The scanner is picked from s_scan_table by the first byte; all bytes
+    >= 128 share slot 128. Returns true when the scanner reports kind 0,
+    which tokenize() treats as the end of the input.
+*/
+bool Tokenizer::nextToken(Token &tok)
+{
+    const int tokenStart = m_ptr;
+    const unsigned char lead = static_cast<unsigned char>(m_buffer[m_ptr]);
+    const scan_fun_ptr scanner = s_scan_table[lead < 128 ? lead : 128];
+
+    int kind = 0;
+    (this->*scanner)(&kind);
+
+    tok.start = tokenStart;
+    tok.length = m_ptr - tokenStart;
+
+    return kind == 0;
+}
+
+/*
+    Consumes exactly one character and reports the character itself as
+    the token kind.
+*/
+void Tokenizer::scanChar(int *kind)
+{
+    const int position = m_ptr;
+    ++m_ptr;
+    *kind = m_buffer[position];
+}
+
+/*
+    Consumes a run of characters flagged A_Whitespace and reports it as
+    Token_whitespaces. The run ends at a 0 byte or at the first character
+    without that flag.
+*/
+void Tokenizer::scanWhiteSpaces(int *kind)
+{
+    *kind = Token_whitespaces;
+
+    for (;;) {
+        const unsigned char ch = m_buffer[m_ptr];
+        if (ch == 0 || !(s_attr_table[ch] & A_Whitespace))
+            return;
+        ++m_ptr;
+    }
+}
+
+/*
+    Consumes one line ending. "\n" and "\r\n" are both reported as kind
+    '\n'. Any other first character, including a '\r' that is not followed
+    by '\n', is consumed alone and reported as itself.
+*/
+void Tokenizer::scanNewline(int *kind)
+{
+    const unsigned char first = m_buffer[m_ptr];
+    ++m_ptr;
+
+    // "\r\n" is folded into a single '\n' token.
+    if (first == '\r' && m_buffer[m_ptr] == '\n') {
+        ++m_ptr;
+        *kind = '\n';
+        return;
+    }
+
+    // A plain '\n' reports '\n'; anything else reports the character.
+    *kind = first;
+}
+
+/*
+    Scanner installed in slot 128 of s_scan_table, i.e. for bytes >= 128.
+    Despite the name it consumes a single byte and reports that byte's
+    value as the token kind.
+*/
+void Tokenizer::scanUnicodeChar(int *kind)
+{
+    *kind = m_buffer[m_ptr];
+    ++m_ptr;
+}
+
+/*
+    Scans a character literal starting at the opening quote and reports
+    Token_char_literal.
+
+    The literal ends after the closing quote. A 0 byte or a '\n' before
+    the closing quote also ends it (### error: unterminated literal); that
+    character is not consumed.
+*/
+void Tokenizer::scanCharLiteral(int *kind)
+{
+    // Every way out of this function reports the same kind.
+    *kind = Token_char_literal;
+
+    ++m_ptr; // opening quote
+
+    for (;;) {
+        const unsigned char ch = m_buffer[m_ptr];
+
+        if (ch == '\0' || ch == '\n')
+            return; // ### error
+
+        if (ch == '\'') {
+            ++m_ptr; // closing quote
+            return;
+        }
+
+        if (ch == '\\') {
+            // An escaped quote or backslash is skipped as a pair so it
+            // cannot be mistaken for the end of the literal.
+            const char escaped = m_buffer[m_ptr + 1];
+            m_ptr += (escaped == '\'' || escaped == '\\') ? 2 : 1;
+            continue;
+        }
+
+        ++m_ptr;
+    }
+}
+
+/*
+    Scans a string literal starting at the opening double quote and
+    reports Token_string_literal.
+
+    The literal ends after the closing quote. A 0 byte or a '\n' before
+    the closing quote also ends it (### error: unterminated literal); that
+    character is not consumed.
+*/
+void Tokenizer::scanStringLiteral(int *kind)
+{
+    // Every way out of this function reports the same kind.
+    *kind = Token_string_literal;
+
+    ++m_ptr; // opening quote
+
+    for (;;) {
+        const char ch = m_buffer[m_ptr];
+
+        if (ch == '\0' || ch == '\n')
+            return; // ### error
+
+        if (ch == '"') {
+            ++m_ptr; // closing quote
+            return;
+        }
+
+        if (ch == '\\') {
+            // An escaped quote or backslash is skipped as a pair so it
+            // cannot be mistaken for the end of the literal.
+            const char escaped = m_buffer[m_ptr + 1];
+            m_ptr += (escaped == '"' || escaped == '\\') ? 2 : 1;
+            continue;
+        }
+
+        ++m_ptr;
+    }
+}
+
+/*
+    Consumes a run of characters flagged A_Alpha or A_Digit and reports
+    it as Token_identifier.
+*/
+void Tokenizer::scanIdentifier(int *kind)
+{
+    while (s_attr_table[static_cast<unsigned char>(m_buffer[m_ptr])] & A_Alphanum)
+        ++m_ptr;
+
+    *kind = Token_identifier;
+}
+
+/*
+    Consumes a run of letters, digits, '_' and '.' characters and reports
+    it as Token_number_literal.
+
+    ### finish to implement me!! The run is not validated as a number,
+    and a sign character always ends it.
+*/
+void Tokenizer::scanNumberLiteral(int *kind)
+{
+    for (;;) {
+        const unsigned char ch = m_buffer[m_ptr];
+        const bool partOfNumber = (s_attr_table[ch] & A_Alphanum) || ch == '.';
+        if (!partOfNumber)
+            break;
+        ++m_ptr;
+    }
+
+    *kind = Token_number_literal;
+}
+
+/*
+    Scans a token that starts with '/'.
+
+    If the next character is neither '/' nor '*', the token is handed to
+    scanOperator(). Otherwise a comment is scanned and reported as
+    Token_comment:
+    - a "//" comment ends before the first '\r' or '\n', which is not
+      consumed;
+    - a slash-star comment ends after the closing star-slash pair and may
+      span several lines.
+    Reaching a 0 byte first also ends the comment (### error: unterminated
+    comment).
+*/
+void Tokenizer::scanComment(int *kind)
+{
+    const char opener = m_buffer[m_ptr + 1];
+    if (opener != '/' && opener != '*') {
+        scanOperator(kind);
+        return;
+    }
+
+    m_ptr += 2; // skip the two-character comment opener
+    const bool multiLineComment = (opener == '*');
+
+    for (;;) {
+        const char ch = m_buffer[m_ptr];
+        if (ch == '\0')
+            break; // ### error
+
+        if (ch == '\r' || ch == '\n') {
+            if (!multiLineComment)
+                break;
+            // Consume the line ending; the kind it stores is replaced
+            // below.
+            scanNewline(kind);
+        } else if (ch == '*' && multiLineComment && m_buffer[m_ptr + 1] == '/') {
+            m_ptr += 2;
+            break;
+        } else {
+            ++m_ptr;
+        }
+    }
+
+    *kind = Token_comment;
+}
+
+
+/*
+    Consumes the single '#' character and reports it as Token_preproc.
+    The rest of the directive is left for the following tokens.
+*/
+void Tokenizer::scanPreprocessor(int *kind)
+{
+    *kind = Token_preproc;
+    ++m_ptr;
+}
+
+
+/*
+    Scans an operator of one to three characters.
+
+    Recognized multi-character operators are reported with their Token_*
+    kind. Anything else is consumed as a single character whose value is
+    the kind.
+
+    The third character is only inspected after the second one matched,
+    so nothing beyond the character following the recognized prefix is
+    read.
+*/
+void Tokenizer::scanOperator(int *kind)
+{
+    const char op = m_buffer[m_ptr];
+    const char next = m_buffer[m_ptr + 1];
+
+    // Fallback: a one-character token whose kind is the character.
+    int width = 1;
+    int token = op;
+
+    switch (op) {
+    case ':':
+        if (next == ':') {
+            width = 2;
+            token = Token_scope;
+        }
+        break;
+
+    case '*':
+    case '/':
+    case '%':
+    case '^':
+        if (next == '=') {
+            width = 2;
+            token = Token_assign;
+        }
+        break;
+
+    case '=':
+    case '!':
+        // NOTE(review): "!=" is reported as Token_eq, the same as "==".
+        if (next == '=') {
+            width = 2;
+            token = Token_eq;
+        }
+        break;
+
+    case '&':
+        if (next == '&') {
+            width = 2;
+            token = Token_and;
+        } else if (next == '=') {
+            width = 2;
+            token = Token_assign;
+        }
+        break;
+
+    case '|':
+        if (next == '|') {
+            width = 2;
+            token = Token_or;
+        } else if (next == '=') {
+            width = 2;
+            token = Token_assign;
+        }
+        break;
+
+    case '+':
+        if (next == '+') {
+            width = 2;
+            token = Token_incr;
+        } else if (next == '=') {
+            width = 2;
+            token = Token_assign;
+        }
+        break;
+
+    case '<':
+    case '>':
+        if (next == op) {
+            // "<<" / ">>", possibly extended to "<<=" / ">>=".
+            if (m_buffer[m_ptr + 2] == '=') {
+                width = 3;
+                token = Token_assign;
+            } else {
+                width = 2;
+                token = Token_shift;
+            }
+        } else if (next == '=') {
+            width = 2;
+            token = (op == '<') ? Token_leq : Token_geq;
+        }
+        break;
+
+    case '-':
+        if (next == '>') {
+            // "->", possibly extended to "->*".
+            if (m_buffer[m_ptr + 2] == '*') {
+                width = 3;
+                token = Token_ptrmem;
+            } else {
+                width = 2;
+                token = Token_arrow;
+            }
+        } else if (next == '-') {
+            width = 2;
+            token = Token_decr;
+        } else if (next == '=') {
+            width = 2;
+            token = Token_assign;
+        }
+        break;
+
+    case '.':
+        if (next == '.' && m_buffer[m_ptr + 2] == '.') {
+            width = 3;
+            token = Token_ellipsis;
+        } else if (next == '*') {
+            width = 2;
+            token = Token_ptrmem;
+        }
+        break;
+    }
+
+    m_ptr += width;
+    *kind = token;
+}
+
+QT_END_NAMESPACE