/**************************************************************************** ** ** Copyright (C) 2001-2004 Roberto Raggi ** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). ** All rights reserved. ** Contact: Nokia Corporation (qt-info@nokia.com) ** ** This file is part of the qt3to4 porting application of the Qt Toolkit. ** ** $QT_BEGIN_LICENSE:LGPL$ ** No Commercial Usage ** This file contains pre-release code and may not be distributed. ** You may use this file in accordance with the terms and conditions ** contained in the Technology Preview License Agreement accompanying ** this package. ** ** GNU Lesser General Public License Usage ** Alternatively, this file may be used under the terms of the GNU Lesser ** General Public License version 2.1 as published by the Free Software ** Foundation and appearing in the file LICENSE.LGPL included in the ** packaging of this file. Please review the following information to ** ensure the GNU Lesser General Public License version 2.1 requirements ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. ** ** In addition, as a special exception, Nokia gives you certain additional ** rights. These rights are described in the Nokia Qt LGPL Exception ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. ** ** If you have questions regarding the use of this file, please contact ** Nokia at qt-info@nokia.com. ** ** ** ** ** ** ** ** ** $QT_END_LICENSE$ ** ****************************************************************************/ #include "tokenizer.h" #include "tokens.h" #include #include #include QT_BEGIN_NAMESPACE using TokenEngine::Token; static QHash preprocessed; bool Tokenizer::s_initialized = false; Tokenizer::scan_fun_ptr Tokenizer::s_scan_table[128 + 1]; int Tokenizer::s_attr_table[256]; Tokenizer::Tokenizer() : m_buffer(0), m_ptr(0) { if (!s_initialized) setupScanTable(); } Tokenizer::~Tokenizer() { } enum { A_Alpha = 0x01, A_Digit = 0x02, A_Alphanum = A_Alpha | A_Digit, A_Whitespace = 0x04 }; void Tokenizer::setupScanTable() { s_initialized = true; memset(s_attr_table, 0, 256); for (int i=0; i<128; ++i) { switch (i) { case ':': case '*': case '%': case '^': case '=': case '!': case '&': case '|': case '+': case '<': case '>': case '-': case '.': s_scan_table[i] = &Tokenizer::scanOperator; break; case '\r': case '\n': s_scan_table[i] = &Tokenizer::scanNewline; break; case '#': s_scan_table[i] = &Tokenizer::scanPreprocessor; break; case '/': s_scan_table[i] = &Tokenizer::scanComment; break; case '\'': s_scan_table[i] = &Tokenizer::scanCharLiteral; break; case '"': s_scan_table[i] = &Tokenizer::scanStringLiteral; break; default: if (isspace(i)) { s_scan_table[i] = &Tokenizer::scanWhiteSpaces; s_attr_table[i] |= A_Whitespace; } else if (isalpha(i) || i == '_') { s_scan_table[i] = &Tokenizer::scanIdentifier; s_attr_table[i] |= A_Alpha; } else if (isdigit(i)) { s_scan_table[i] = &Tokenizer::scanNumberLiteral; s_attr_table[i] |= A_Digit; } else s_scan_table[i] = &Tokenizer::scanChar; } } s_scan_table[128] = &Tokenizer::scanUnicodeChar; } QVector Tokenizer::tokenize(QByteArray text) { m_tokens.clear(); m_buffer = text; m_ptr = 0; // tokenize for (;;) { Token tk; bool endOfFile = nextToken(tk); if (endOfFile) { break; } m_tokens.append(tk); } return m_tokens; } bool Tokenizer::nextToken(Token &tok) { int start = m_ptr; unsigned char ch = (unsigned char)m_buffer[m_ptr]; int kind = 0; (this->*s_scan_table[ch < 128 ? ch : 128])(&kind); tok.start = start; tok.length = m_ptr - start; return (kind == 0); } void Tokenizer::scanChar(int *kind) { *kind = m_buffer[m_ptr++]; } void Tokenizer::scanWhiteSpaces(int *kind) { *kind = Token_whitespaces; while (unsigned char ch = m_buffer[m_ptr]) { if (s_attr_table[ch] & A_Whitespace) ++m_ptr; else break; } } void Tokenizer::scanNewline(int *kind) { Q_UNUSED(kind); const unsigned char ch = m_buffer[m_ptr++]; // Check for \n. if (ch == '\n') { *kind = '\n'; return; } // Check for \r\n. if (ch == '\r' && m_buffer[m_ptr] == '\n') { *kind = '\n'; ++ m_ptr; return; } *kind = ch; } void Tokenizer::scanUnicodeChar(int *kind) { *kind = m_buffer[m_ptr++]; } void Tokenizer::scanCharLiteral(int *kind) { ++m_ptr; for (;;) { unsigned char ch = m_buffer[m_ptr]; switch (ch) { case '\0': case '\n': // ### error *kind = Token_char_literal; return; case '\\': if (m_buffer[m_ptr+1] == '\'' || m_buffer[m_ptr+1] == '\\') m_ptr += 2; else ++m_ptr; break; case '\'': ++m_ptr; *kind = Token_char_literal; return; default: ++m_ptr; break; } } // ### error *kind = Token_char_literal; } void Tokenizer::scanStringLiteral(int *kind) { ++m_ptr; while (m_buffer[m_ptr]) { switch (m_buffer[m_ptr]) { case '\n': // ### error *kind = Token_string_literal; return; case '\\': if (m_buffer[m_ptr+1] == '"' || m_buffer[m_ptr+1] == '\\') m_ptr += 2; else ++m_ptr; break; case '"': ++m_ptr; *kind = Token_string_literal; return; default: ++m_ptr; break; } } // ### error *kind = Token_string_literal; } void Tokenizer::scanIdentifier(int *kind) { unsigned char ch; for (;;) { ch = m_buffer[m_ptr]; if (s_attr_table[ch] & A_Alphanum) ++m_ptr; else break; } *kind = Token_identifier; } void Tokenizer::scanNumberLiteral(int *kind) { unsigned char ch; for (;;) { ch = m_buffer[m_ptr]; if (s_attr_table[ch] & A_Alphanum || ch == '.') ++m_ptr; else break; } // ### finish to implement me!! *kind = Token_number_literal; } void Tokenizer::scanComment(int *kind) { if (!(m_buffer[m_ptr+1] == '/' || m_buffer[m_ptr+1] == '*')) { scanOperator(kind); return; } ++m_ptr; // skip '/' bool multiLineComment = m_buffer[m_ptr++] == '*'; while (m_buffer[m_ptr]) { switch (m_buffer[m_ptr]) { case '\r': case '\n': if (!multiLineComment) { *kind = Token_comment; return; } (void) scanNewline(kind); break; case '*': if (multiLineComment && m_buffer[m_ptr+1] == '/') { m_ptr += 2; *kind = Token_comment; return; } ++m_ptr; break; default: ++m_ptr; } } // ### error *kind = Token_comment; } void Tokenizer::scanPreprocessor(int *kind) { ++m_ptr; *kind = Token_preproc; } void Tokenizer::scanOperator(int *kind) { switch (m_buffer[m_ptr]) { case ':': if (m_buffer[m_ptr+1] == ':') { m_ptr += 2; *kind = Token_scope; return; } break; case '*': case '/': case '%': case '^': if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_assign; return; } break; case '=': case '!': if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_eq; return; } break; case '&': if (m_buffer[m_ptr+1] == '&') { m_ptr += 2; *kind = Token_and; return; } else if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_assign; return; } break; case '|': if (m_buffer[m_ptr+1] == '|' ) { m_ptr += 2; *kind = Token_or; return; } else if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_assign; return; } break; case '+': if (m_buffer[m_ptr+1] == '+' ) { m_ptr += 2; *kind = Token_incr; return; } else if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_assign; return; } break; case '<': if (m_buffer[m_ptr+1] == '<') { if (m_buffer[m_ptr+2] == '=') { m_ptr += 3; *kind = Token_assign; return; } m_ptr += 2; *kind = Token_shift; return; } else if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_leq; return; } break; case '>': if (m_buffer[m_ptr+1] == '>') { if (m_buffer[m_ptr+2] == '=') { m_ptr += 3; *kind = Token_assign; return; } m_ptr += 2; *kind = Token_shift; return; } else if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_geq; return; } break; case '-': if (m_buffer[m_ptr+1] == '>') { if (m_buffer[m_ptr+2] == '*') { m_ptr += 3; *kind = Token_ptrmem; return; } m_ptr += 2; *kind = Token_arrow; return; } else if (m_buffer[m_ptr+1] == '-') { m_ptr += 2; *kind = Token_decr; return; } else if (m_buffer[m_ptr+1] == '=') { m_ptr += 2; *kind = Token_assign; return; } break; case '.': if (m_buffer[m_ptr+1] == '.' && m_buffer[m_ptr+2] == '.') { m_ptr += 3; *kind = Token_ellipsis; return; } else if (m_buffer[m_ptr+1] == '*') { m_ptr += 2; *kind = Token_ptrmem; return; } break; } *kind = m_buffer[m_ptr++]; } QT_END_NAMESPACE