diff options
author | Eike Ziller <git@eikeziller.de> | 2017-04-29 16:17:11 +0200 |
---|---|---|
committer | Eike Ziller <git@eikeziller.de> | 2017-10-01 20:11:08 +0200 |
commit | 5798e33d742c0f413d2d865fdb75739b4374ce98 (patch) | |
tree | e7d36edf5de22ab74ed4b56e2e2b22be24f50ef6 | |
parent | 2f69373309cfe88084c5777baeff6bb46eecd071 (diff) |
Add highlighter
See lexical structure of Haskell
https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
-rw-r--r-- | plugins/haskell/haskell.pro | 8 | ||||
-rw-r--r-- | plugins/haskell/haskelleditorfactory.cpp | 2 | ||||
-rw-r--r-- | plugins/haskell/haskellhighlighter.cpp | 152 | ||||
-rw-r--r-- | plugins/haskell/haskellhighlighter.h | 58 | ||||
-rw-r--r-- | plugins/haskell/haskelltokenizer.cpp | 631 | ||||
-rw-r--r-- | plugins/haskell/haskelltokenizer.h | 91 | ||||
-rw-r--r-- | tests/auto/tokenizer/tokenizer.pro | 11 | ||||
-rw-r--r-- | tests/auto/tokenizer/tst_tokenizer.cpp | 730 |
8 files changed, 1681 insertions, 2 deletions
diff --git a/plugins/haskell/haskell.pro b/plugins/haskell/haskell.pro index 9c557b1..aa92d3c 100644 --- a/plugins/haskell/haskell.pro +++ b/plugins/haskell/haskell.pro @@ -5,14 +5,18 @@ DEFINES += HASKELL_LIBRARY SOURCES += \ haskellcompletionassist.cpp \ haskelleditorfactory.cpp \ - haskellplugin.cpp + haskellplugin.cpp \ + haskellhighlighter.cpp \ + haskelltokenizer.cpp HEADERS += \ haskell_global.h \ haskellcompletionassist.h \ haskellconstants.h \ haskelleditorfactory.h \ - haskellplugin.h + haskellplugin.h \ + haskellhighlighter.h \ + haskelltokenizer.h ## uncomment to build plugin into user config directory ## <localappdata>/plugins/<ideversion> diff --git a/plugins/haskell/haskelleditorfactory.cpp b/plugins/haskell/haskelleditorfactory.cpp index 220e52e..8119105 100644 --- a/plugins/haskell/haskelleditorfactory.cpp +++ b/plugins/haskell/haskelleditorfactory.cpp @@ -27,6 +27,7 @@ #include "haskellcompletionassist.h" #include "haskellconstants.h" +#include "haskellhighlighter.h" #include <texteditor/textdocument.h> #include <texteditor/texteditoractionhandler.h> @@ -47,6 +48,7 @@ HaskellEditorFactory::HaskellEditorFactory() setParenthesesMatchingEnabled(true); setMarksVisible(true); setCompletionAssistProvider(new HaskellCompletionAssistProvider); + setSyntaxHighlighterCreator([] { return new HaskellHighlighter(); }); } } // Internal diff --git a/plugins/haskell/haskellhighlighter.cpp b/plugins/haskell/haskellhighlighter.cpp new file mode 100644 index 0000000..9899cc4 --- /dev/null +++ b/plugins/haskell/haskellhighlighter.cpp @@ -0,0 +1,152 @@ +/**************************************************************************** +** +** Copyright (C) 2017 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of Qt Creator. 
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include "haskellhighlighter.h"
+
+#include "haskelltokenizer.h"
+
+#include <texteditor/fontsettings.h>
+#include <texteditor/texteditorconstants.h>
+#include <texteditor/texteditorsettings.h>
+
+#include <QDebug>
+#include <QSet>
+#include <QVector>
+
+// Identifiers that the tokenizer reports as plain variables, but which are
+// rendered as keywords when they follow the "import" keyword in the same block.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, IMPORT_HIGHLIGHTS, ({
+    "qualified",
+    "as",
+    "hiding"
+}));
+
+using namespace TextEditor;
+
+namespace Haskell {
+namespace Internal {
+
+// Sets up the default text format categories and the derived format that is
+// used for the name of a top-level declaration.
+HaskellHighlighter::HaskellHighlighter()
+{
+    setDefaultTextFormatCategories();
+    updateFormats(TextEditorSettings::fontSettings());
+}
+
+// Highlights one text block.
+//
+// The block is tokenized starting from the state stored for the previous block,
+// and the state the tokenizer ends in is stored as the state of this block.
+// Each token is then mapped to a text style by its type.
+void HaskellHighlighter::highlightBlock(const QString &text)
+{
+    const Tokens tokens = HaskellTokenizer::tokenize(text, previousBlockState());
+    setCurrentBlockState(tokens.state);
+    // The pointers refer to elements of 'tokens', which outlives the loop.
+    const Token *firstNonWS = nullptr;
+    const Token *secondNonWS = nullptr;
+    bool inType = false;   // set once "::" was seen as the second non-whitespace token
+    bool inImport = false; // set once the "import" keyword was seen
+    for (const Token &token : tokens) {
+        switch (token.type) {
+        case TokenType::Variable:
+            if (inType)
+                setTokenFormat(token, C_LOCAL);
+            else if (inImport && IMPORT_HIGHLIGHTS->contains(token.text.toString()))
+                setTokenFormat(token, C_KEYWORD);
+//            else
+//                setTokenFormat(token, C_TEXT);
+            break;
+        case TokenType::Constructor:
+        case TokenType::OperatorConstructor:
+            setTokenFormat(token, C_TYPE);
+            break;
+        case TokenType::Operator:
+            setTokenFormat(token, C_OPERATOR);
+            break;
+        case TokenType::Whitespace:
+            setTokenFormat(token, C_VISUAL_WHITESPACE);
+            break;
+        case TokenType::Keyword:
+            if (token.text == "::" && firstNonWS && !secondNonWS) { // toplevel declaration
+                // re-format the name in front of "::" as a declaration
+                setFormat(firstNonWS->startCol, firstNonWS->length, m_toplevelDeclFormat);
+                inType = true;
+            } else if (token.text == "import") {
+                inImport = true;
+            }
+            setTokenFormat(token, C_KEYWORD);
+            break;
+        case TokenType::Integer:
+        case TokenType::Float:
+            setTokenFormat(token, C_NUMBER);
+            break;
+        case TokenType::String:
+            setTokenFormatWithSpaces(text, token, C_STRING);
+            break;
+        case TokenType::Char:
+            setTokenFormatWithSpaces(text, token, C_STRING);
+            break;
+        case TokenType::EscapeSequence:
+            setTokenFormat(token, C_PRIMITIVE_TYPE);
+            break;
+        case TokenType::SingleLineComment:
+            setTokenFormatWithSpaces(text, token, C_COMMENT);
+            break;
+        case TokenType::MultiLineComment:
+            setTokenFormatWithSpaces(text, token, C_COMMENT);
+            break;
+        case TokenType::Special:
+//            setTokenFormat(token, C_TEXT);
+            break;
+        case TokenType::StringError:
+        case TokenType::CharError:
+        case TokenType::Unknown:
+            setTokenFormat(token, C_PARENTHESES_MISMATCH);
+            break;
+        }
+        // remember the first two non-whitespace tokens of the block
+        if (token.type != TokenType::Whitespace) {
+            if (!firstNonWS)
+                firstNonWS = &token;
+            else if (!secondNonWS)
+                secondNonWS = &token;
+        }
+    }
+}
+
+// Forwards the new font settings to the base class and refreshes the formats
+// that this class derives from them.
+void HaskellHighlighter::setFontSettings(const FontSettings &fontSettings)
+{
+    SyntaxHighlighter::setFontSettings(fontSettings);
+    updateFormats(fontSettings);
+}
+
+// Recomputes the format for top-level declaration names, which mixes the
+// C_FUNCTION style with the C_DECLARATION style.
+void HaskellHighlighter::updateFormats(const FontSettings &fontSettings)
+{
+    m_toplevelDeclFormat = fontSettings.toTextCharFormat(
+                TextStyles::mixinStyle(C_FUNCTION, C_DECLARATION));
+}
+
+// Applies the format of category 'style' to the columns covered by 'token'.
+void HaskellHighlighter::setTokenFormat(const Token &token, TextStyle style)
+{
+    setFormat(token.startCol, token.length, formatForCategory(style));
+}
+
+// Like setTokenFormat(), but goes through setFormatWithSpaces(), which also
+// receives the block text.
+void HaskellHighlighter::setTokenFormatWithSpaces(const QString &text, const Token &token,
+                                                  TextStyle style)
+{
+    setFormatWithSpaces(text, token.startCol, token.length, formatForCategory(style));
+}
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskellhighlighter.h b/plugins/haskell/haskellhighlighter.h
new file mode 100644
index 0000000..6213333
--- /dev/null
+++ b/plugins/haskell/haskellhighlighter.h
@@ -0,0 +1,58 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#pragma once
+
+#include <texteditor/syntaxhighlighter.h>
+
+#include <QHash>
+#include <QTextFormat>
+
+namespace Haskell {
+namespace Internal {
+
+class Token;
+
+// Syntax highlighter for Haskell source text, built on TextEditor::SyntaxHighlighter.
+class HaskellHighlighter : public TextEditor::SyntaxHighlighter
+{
+    Q_OBJECT
+
+public:
+    HaskellHighlighter();
+
+protected:
+    // Called with the text of one block that needs highlighting.
+    void highlightBlock(const QString &text) override;
+
+private:
+    void setFontSettings(const TextEditor::FontSettings &fontSettings) override;
+    // Recomputes the formats that are derived from the font settings.
+    void updateFormats(const TextEditor::FontSettings &fontSettings);
+    // Apply the format for 'style' to the range covered by 'token'.
+    void setTokenFormat(const Token &token, TextEditor::TextStyle style);
+    void setTokenFormatWithSpaces(const QString &text, const Token &token,
+                                  TextEditor::TextStyle style);
+    // Format used for the name of a top-level declaration.
+    QTextCharFormat m_toplevelDeclFormat;
+};
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskelltokenizer.cpp b/plugins/haskell/haskelltokenizer.cpp
new file mode 100644
index 0000000..527e505
--- /dev/null
+++ b/plugins/haskell/haskelltokenizer.cpp
@@ -0,0 +1,631 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include "haskelltokenizer.h"
+
+#include <QSet>
+
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <numeric>
+
+// Operator sequences that are reported as keywords instead of operators.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_OP, ({
+    "..",
+    ":",
+    "::",
+    "=",
+    "\\",
+    "|",
+    "<-",
+    "->",
+    "@",
+    "~",
+    "=>",
+
+    // Arrows GHC extension
+    "-<",
+    "-<<",
+    ">-",
+    ">>-",
+    "(|",
+    "|)"
+}));
+
+// Identifiers that are reported as keywords instead of variables.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_ID, ({
+    "case",
+    "class",
+    "data",
+    "default",
+    "deriving",
+    "do",
+    "else",
+    "foreign",
+    "if",
+    "import",
+    "in",
+    "infix",
+    "infixl",
+    "infixr",
+    "instance",
+    "let",
+    "module",
+    "newtype",
+    "of",
+    "then",
+    "type",
+    "where",
+    "_",
+
+    // from GHC extensions
+    "family",
+    "forall",
+    "mdo",
+    "proc",
+    "rec"
+}));
+
+// Single characters that form a token of type Special on their own.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, SPECIAL, ({
+    '(',
+    ')',
+    ',',
+    ';',
+    '[',
+    ']',
+    '`',
+    '{',
+    '}',
+}));
+
+// Characters that form a complete escape when they directly follow a backslash.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, CHAR_ESCAPES,
+                          ({'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'}));
+
+// Named ASCII escapes. The list is searched front to back with startsWith(),
+// so a name must come before any other name that is a prefix of it.
+Q_GLOBAL_STATIC_WITH_ARGS(QVector<QString>, ASCII_ESCAPES, ({
+    "NUL",
+    "SOH", // must be before "SO" to match
+    "STX",
+    "ETX",
+    "EOT",
+    "ENQ",
+    "ACK",
+    "BEL",
+    "BS",
+    "HT",
+    "LF",
+    "VT",
+    "FF",
+    "CR",
+    "SO",
+    "SI",
+    "DLE",
+    "DC1",
+    "DC2",
+    "DC3",
+    "DC4",
+    "NAK",
+    "SYN",
+    "ETB",
+    "CAN",
+    "EM",
+    "SUB",
+    "ESC",
+    "FS",
+    "GS",
+    "RS",
+    "US",
+    "SP",
+    "DEL"
+}));
+
+namespace Haskell {
+namespace Internal {
+
+// Creates a token of the given type that covers the columns [start, end) of 'line'.
+// The token keeps a reference to 'line' so that its text stays valid.
+Token token(TokenType type, std::shared_ptr<QString> line, int start, int end)
+{
+    return {type, start, end - start, line->midRef(start, end - start), line};
+}
+
+Tokens::Tokens(std::shared_ptr<QString> source)
+    : source(source)
+{
+}
+
+// Returns the number of consecutive characters of 'line', starting at index
+// 'begin', for which 'test' returns true.
+static int grab(const QString &line, int begin,
+                const std::function<bool(const QChar&)> &test)
+{
+    const int length = line.length();
+    int current = begin;
+    while (current < length && test(line.at(current)))
+        ++current;
+    return current - begin;
+}
+
+
+static bool isIdentifierChar(const QChar &c)
+{
+    return c.isLetterOrNumber() || c == '\'' || c == '_';
+}
+
+static bool isVariableIdentifierStart(const QChar &c)
+{
+    return c == '_' || c.isLower();
+}
+
+static bool isAscSymbol(const QChar &c)
+{
+    return c == '!'
+            || c == '#'
+            || c == '$'
+            || c == '%'
+            || c == '&'
+            || c == '*'
+            || c == '+'
+            || c == '.'
+            || c == '/'
+            || c == '<'
+            || c == '='
+            || c == '>'
+            || c == '?'
+            || c == '@'
+            || c == '\\'
+            || c == '^'
+            || c == '|'
+            || c == '-'
+            || c == '~'
+            || c == ':';
+}
+
+// True for ASCII symbols, and for any other symbol or punctuation character
+// that is not '_', a quote, or one of the SPECIAL characters.
+static bool isSymbol(const QChar &c)
+{
+    return isAscSymbol(c)
+            || ((c.isSymbol() || c.isPunct()) && c != '_' && c != '"' && c != '\''
+                && !SPECIAL->contains(c));
+}
+
+static bool isDigit(const QChar &c)
+{
+    return c.isDigit();
+}
+
+static bool isOctit(const QChar &c)
+{
+    return c >= '0' && c <= '7';
+}
+
+static bool isHexit(const QChar &c)
+{
+    return c.isDigit()
+            || (c >= 'A' && c <= 'F')
+            || (c >= 'a' && c <= 'f');
+}
+
+// True for the characters that may follow '^' in a control character escape.
+static bool isCntrl(const QChar &c)
+{
+    return (c >= 'A' && c <= 'Z')
+            || c == '@'
+            || c == '['
+            || c == '\\'
+            || c == ']'
+            || c == '^'
+            || c == '_';
+}
+
+// Returns a single Whitespace token for the run of whitespace starting at
+// 'start', or nothing if there is no whitespace at that position.
+static QVector<Token> getSpace(std::shared_ptr<QString> line, int start)
+{
+    const auto lineEnd = line->cend();
+    const auto tokenStart = line->cbegin() + start;
+    auto current = tokenStart;
+    while (current != lineEnd && (*current).isSpace())
+        ++current;
+    const int length = int(std::distance(tokenStart, current));
+    if (current > tokenStart)
+        return {{TokenType::Whitespace, start, length, line->midRef(start, length), line}};
+    return {};
+}
+
+// Returns an Integer or Float token if 'line' has a digit at 'start', nothing otherwise.
+// Handles 0o/0O octal and 0x/0X hexadecimal integers, decimals, fractions and exponents.
+static QVector<Token> getNumber(std::shared_ptr<QString> line, int start)
+{
+    const QChar &startC = line->at(start);
+    if (!startC.isDigit())
+        return {};
+    const int length = line->length();
+    int current = start + 1;
+    TokenType type = TokenType::Integer;
+    if (current < length) {
+        if (startC == '0') {
+            // check for octal or hexadecimal
+            const QChar &secondC = line->at(current);
+            if (secondC == 'o' || secondC == 'O') {
+                const int numLen = grab(*line, current + 1, isOctit);
+                if (numLen > 0)
+                    return {token(TokenType::Integer, line, start, current + numLen + 1)};
+            } else if (secondC == 'x' || secondC == 'X') {
+                const int numLen = grab(*line, current + 1, isHexit);
+                if (numLen > 0)
+                    return {token(TokenType::Integer, line, start, current + numLen + 1)};
+            }
+        }
+        // starts with decimal
+        const int numLen = grab(*line, start, isDigit);
+        current = start + numLen;
+        // check for floating point
+        if (current < length && line->at(current) == '.') {
+            const int numLen = grab(*line, current + 1, isDigit);
+            if (numLen > 0) {
+                current += numLen + 1;
+                type = TokenType::Float;
+            }
+        }
+        // check for exponent
+        if (current + 1 < length /*for at least 'e' and digit*/
+                && (line->at(current) == 'e' || line->at(current) == 'E')) {
+            int expEnd = current + 1;
+            if (line->at(expEnd) == '+' || line->at(expEnd) == '-')
+                ++expEnd;
+            const int numLen = grab(*line, expEnd, isDigit);
+            if (numLen > 0) {
+                current = expEnd + numLen;
+                type = TokenType::Float;
+            }
+        }
+    }
+    return {token(type, line, start, current)};
+}
+
+// Handles everything that can start with an optional module qualification:
+// constructors, variables, reserved ids, operators, reserved operators,
+// operator constructors and single line comments.
+// Returns nothing if none of these starts at 'start'.
+static QVector<Token> getIdOrOpOrSingleLineComment(std::shared_ptr<QString> line, int start)
+{
+    const int length = line->length();
+    if (start >= length)
+        return {};
+    int current = start;
+    // check for {conid.}conid
+    int conidEnd = start;
+    bool canOnlyBeConstructor = false;
+    while (current < length && line->at(current).isUpper()) {
+        current += grab(*line, current, isIdentifierChar);
+        conidEnd = current;
+        // it is definitely a constructor id if it is not followed by a '.'
+        canOnlyBeConstructor = current >= length || line->at(current) != '.';
+        // otherwise it might be a module id, and we skip the dot to check for qualified thing
+        if (!canOnlyBeConstructor)
+            ++current;
+    }
+    if (canOnlyBeConstructor)
+        return {token(TokenType::Constructor, line, start, conidEnd)};
+
+    // check for variable or reserved id
+    if (current < length && isVariableIdentifierStart(line->at(current))) {
+        const int varLen = grab(*line, current, isIdentifierChar);
+        // check for reserved id
+        if (RESERVED_ID->contains(line->mid(current, varLen))) {
+            QVector<Token> result;
+            // possibly add constructor + op '.'
+            if (conidEnd > start) {
+                result.append(token(TokenType::Constructor, line, start, conidEnd));
+                result.append(token(TokenType::Operator, line, conidEnd, current));
+            }
+            result.append(token(TokenType::Keyword, line, current, current + varLen));
+            return result;
+        }
+        return {token(TokenType::Variable, line, start, current + varLen)};
+    }
+    // check for operator
+    if (current < length && isSymbol(line->at(current))) {
+        const int opLen = grab(*line, current, isSymbol);
+        // check for reserved op
+        if (RESERVED_OP->contains(line->mid(current, opLen))) {
+            // because of the case of F... (constructor + op '...') etc
+            // we only add conid if we have one, handling the rest in next iteration
+            if (conidEnd > start)
+                return {token(TokenType::Constructor, line, start, conidEnd)};
+            return {token(TokenType::Keyword, line, start, current + opLen)};
+        }
+        // check for single line comment
+        // (const iterators, the non-const begin() would needlessly detach the string)
+        if (opLen >= 2 && std::all_of(line->cbegin() + current, line->cbegin() + current + opLen,
+                                      [](const QChar c) { return c == '-'; })) {
+            QVector<Token> result;
+            // possibly add constructor + op '.'
+            if (conidEnd > start) {
+                result.append(token(TokenType::Constructor, line, start, conidEnd));
+                result.append(token(TokenType::Operator, line, conidEnd, current));
+            }
+            // rest is comment
+            result.append(token(TokenType::SingleLineComment, line, current, length));
+            return result;
+        }
+        // check for (qualified?) operator constructor
+        if (line->at(current) == ':')
+            return {token(TokenType::OperatorConstructor, line, start, current + opLen)};
+        return {token(TokenType::Operator, line, start, current + opLen)};
+    }
+    // Foo.Blah.
+    if (conidEnd > start)
+        return {token(TokenType::Constructor, line, start, conidEnd)};
+    return {};
+}
+
+// Returns the length of the escape that starts at index 'start' of 'line',
+// where 'start' is the position directly after the backslash.
+// Returns 0 if there is no valid escape. 'start' must be a valid index.
+static int getEscape(const QString &line, int start)
+{
+    if (CHAR_ESCAPES->contains(line.at(start)))
+        return 1;
+
+    // decimal
+    if (line.at(start).isDigit())
+        return grab(line, start + 1, isDigit) + 1;
+    // octal
+    if (line.at(start) == 'o') {
+        const int count = grab(line, start + 1, isOctit);
+        if (count < 1) // no octal number after 'o'
+            return 0;
+        return count + 1;
+    }
+    // hexadecimal
+    if (line.at(start) == 'x') {
+        const int count = grab(line, start + 1, isHexit);
+        if (count < 1) // no hexadecimal number after 'x'
+            return 0;
+        return count + 1;
+    }
+    // ascii cntrl
+    if (line.at(start) == '^') {
+        // the escape is '^' plus exactly one control designator,
+        // anything that follows is ordinary content again
+        if (start + 1 < line.length() && isCntrl(line.at(start + 1)))
+            return 2;
+        return 0; // no control designator after '^'
+    }
+    const QStringRef s = line.midRef(start);
+    for (const QString &esc : *ASCII_ESCAPES) {
+        if (s.startsWith(esc))
+            return esc.length();
+    }
+    return 0;
+}
+
+// Tokenizes a string literal, or the continuation of one after a string gap.
+//
+// 'inStringGap' is in-out: on entry it tells whether the previous line ended
+// inside a string gap, on exit whether this line ends inside one.
+// Returns String, EscapeSequence and StringError tokens, or nothing if no
+// string starts at 'start'.
+static QVector<Token> getString(std::shared_ptr<QString> line, int start, bool *inStringGap/*in-out*/)
+{
+    // Haskell has the specialty of using \<whitespace>\ within strings for multiline strings
+    const int length = line->length();
+    if (start >= length)
+        return {};
+    QVector<Token> result;
+    int tokenStart = start;
+    int current = tokenStart;
+    bool inString = *inStringGap;
+    do {
+        const QChar c = line->at(current);
+        if (*inStringGap && !c.isSpace() && c != '\\') {
+            // invalid non-whitespace in string gap
+            // add previous string as token, if there is any; there is none if the gap was
+            // started on the previous line and this line directly starts with the error
+            if (current > tokenStart)
+                result.append(token(TokenType::String, line, tokenStart, current));
+            // then add wrong non-whitespace
+            tokenStart = current;
+            do { ++current; } while (current < length && !line->at(current).isSpace());
+            result.append(token(TokenType::StringError, line, tokenStart, current));
+            tokenStart = current;
+        } else if (c == '"') {
+            inString = !inString;
+            ++current;
+        } else if (inString) {
+            if (c == '\\') {
+                ++current;
+                if (*inStringGap) {
+                    // ending string gap
+                    *inStringGap = false;
+                } else if (current >= length || line->at(current).isSpace()) {
+                    // starting string gap
+                    *inStringGap = true;
+                    current = std::min(current + 1, length);
+                } else { // there is at least one character after current
+                    const int escapeLength = getEscape(*line, current);
+                    if (escapeLength > 0) {
+                        // valid escape
+                        // add previous string as token without backslash, if necessary
+                        if (tokenStart < current - 1/*backslash*/)
+                            result.append(token(TokenType::String, line, tokenStart, current - 1));
+                        tokenStart = current - 1; // backslash
+                        current += escapeLength;
+                        result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
+                        tokenStart = current;
+                    } else { // invalid escape sequence
+                        // add previous string as token, this is at least backslash
+                        result.append(token(TokenType::String, line, tokenStart, current));
+                        result.append(token(TokenType::StringError, line, current, current + 1));
+                        ++current;
+                        tokenStart = current;
+                    }
+                }
+            } else {
+                ++current;
+            }
+        }
+    } while (current < length && inString);
+    if (current > tokenStart)
+        result.append(token(TokenType::String, line, tokenStart, current));
+    if (inString && !*inStringGap) { // unterminated string
+        // mark last character of last token as Unknown as an error hint
+        if (!result.isEmpty()) { // should actually never be different
+            Token &lastRef = result.last();
+            if (lastRef.length == 1) {
+                lastRef.type = TokenType::StringError;
+            } else {
+                --lastRef.length;
+                lastRef.text = line->midRef(lastRef.startCol, lastRef.length);
+                result.append(token(TokenType::StringError, line, current - 1, current));
+            }
+        }
+    }
+    return result;
+}
+
+// Tokenizes a (possibly nested) multi line comment.
+//
+// 'commentLevel' is in-out: the nesting depth at 'start' on entry, the nesting
+// depth at the end of the returned token on exit.
+// Returns nothing if no comment is open and none starts at 'start'.
+static QVector<Token> getMultiLineComment(std::shared_ptr<QString> line, int start,
+                                          int *commentLevel/*in_out*/)
+{
+    // Haskell multiline comments can be nested {- foo {- bar -} blah -}
+    const int length = line->length();
+    int current = start;
+    do {
+        const QStringRef test = line->midRef(current, 2);
+        if (test == "{-") {
+            ++(*commentLevel);
+            current += 2;
+        } else if (test == "-}" && *commentLevel > 0) {
+            --(*commentLevel);
+            current += 2;
+        } else if (*commentLevel > 0) {
+            ++current;
+        }
+    } while (current < length && *commentLevel > 0);
+    if (current > start) {
+        return {token(TokenType::MultiLineComment, line, start, current)};
+    }
+    return {};
+}
+
+// Tokenizes a character literal into Char, EscapeSequence and CharError tokens.
+// Returns nothing if 'line' does not have a single quote at 'start'.
+static QVector<Token> getChar(std::shared_ptr<QString> line, int start)
+{
+    if (line->at(start) != '\'')
+        return {};
+    QVector<Token> result;
+    const int length = line->length();
+    int tokenStart = start;
+    int current = tokenStart + 1;
+    bool inChar = true;
+    int count = 0; // number of characters (an escape counts as one) seen inside the quotes
+    while (current < length && inChar) {
+        if (line->at(current) == '\'') {
+            inChar = false;
+            ++current;
+        } else if (count == 1) {
+            // we already have one character, so start Unknown token
+            if (current > tokenStart)
+                result.append(token(TokenType::Char, line, tokenStart, current));
+            tokenStart = current;
+            ++count;
+            ++current;
+        } else if (count > 1) {
+            ++count;
+            ++current;
+        } else if (line->at(current) == '\\') {
+            if (current + 1 < length) {
+                ++current;
+                ++count;
+                const int escapeLength = getEscape(*line, current);
+                if (line->at(current) != '&' && escapeLength > 0) { // no & escape for chars
+                    // valid escape
+                    // add previous string as token without backslash, if necessary
+                    if (tokenStart < current - 1/*backslash*/)
+                        result.append(token(TokenType::Char, line, tokenStart, current - 1));
+                    tokenStart = current - 1; // backslash
+                    current += escapeLength;
+                    result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
+                    tokenStart = current;
+                } else { // invalid escape sequence
+                    // add previous string as token, this is at least backslash
+                    result.append(token(TokenType::Char, line, tokenStart, current));
+                    result.append(token(TokenType::CharError, line, current, current + 1));
+                    ++current;
+                    tokenStart = current;
+                }
+            } else {
+                ++current;
+            }
+        } else {
+            ++count;
+            ++current;
+        }
+    }
+    if (count > 1 && inChar) {
+        // too long and unterminated, just add Unknown token till end
+        result.append(token(TokenType::CharError, line, tokenStart, current));
+    } else if (count > 1) {
+        // too long but terminated, add Unknown up to ending quote, then quote
+        result.append(token(TokenType::CharError, line, tokenStart, current - 1));
+        result.append(token(TokenType::Char, line, current - 1, current));
+    } else if (inChar || count < 1) {
+        // unterminated, or no character inside, mark last character as error
+        if (current > tokenStart + 1)
+            result.append(token(TokenType::Char, line, tokenStart, current - 1));
+        result.append(token(TokenType::CharError, line, current - 1, current));
+    } else {
+        result.append(token(TokenType::Char, line, tokenStart, current));
+    }
+    return result;
+}
+
+// Returns a Special token if the character at 'start' is in SPECIAL, nothing otherwise.
+static QVector<Token> getSpecial(std::shared_ptr<QString> line, int start)
+{
+    if (SPECIAL->contains(line->at(start)))
+        return {{TokenType::Special, start, 1, line->midRef(start, 1), line}};
+    return {};
+}
+
+// Splits 'line' into tokens.
+//
+// 'startState' is the state that the previous line ended in: State::StringGap
+// if it ended inside a string gap, State::MultiLineCommentGuard plus the
+// nesting depth if it ended inside a multi line comment. Any other value is
+// treated as "no state". The state that this line ends in is stored in the
+// 'state' member of the result, encoded the same way.
+Tokens HaskellTokenizer::tokenize(const QString &line, int startState)
+{
+    Tokens result(std::make_shared<QString>(line));
+    const int length = result.source->length();
+    bool inStringGap = startState == int(Tokens::State::StringGap);
+    int multiLineCommentLevel = std::max(startState - int(Tokens::State::MultiLineCommentGuard), 0);
+    int currentStart = 0;
+    QVector<Token> tokens;
+    // Try the token kinds in order, the first one that produces tokens wins.
+    while (currentStart < length) {
+        if (multiLineCommentLevel <= 0 &&
+                !(tokens = getString(result.source, currentStart, &inStringGap)).isEmpty()) {
+            result.append(tokens);
+        } else if (!(tokens = getMultiLineComment(result.source, currentStart,
+                                                  &multiLineCommentLevel)).isEmpty()) {
+            result.append(tokens);
+        } else if (!(tokens = getChar(result.source, currentStart)).isEmpty()) {
+            result.append(tokens);
+        } else if (!(tokens = getSpace(result.source, currentStart)).isEmpty()) {
+            result.append(tokens);
+        } else if (!(tokens = getNumber(result.source, currentStart)).isEmpty()) {
+            result.append(tokens);
+        } else if (!(tokens = getIdOrOpOrSingleLineComment(result.source, currentStart)).isEmpty()) {
+            result.append(tokens);
+        } else if (!(tokens = getSpecial(result.source, currentStart)).isEmpty()) {
+            result.append(tokens);
+        } else {
+            // nothing matched, consume a single character as Unknown
+            tokens = {{TokenType::Unknown,
+                       currentStart,
+                       1,
+                       result.source->midRef(currentStart, 1),
+                       result.source}};
+            result.append(tokens);
+        }
+        // continue after the tokens that were just added
+        currentStart += std::accumulate(tokens.cbegin(), tokens.cend(), 0,
+                                        [](int s, const Token &t) { return s + t.length; });
+    }
+    if (inStringGap)
+        result.state = int(Tokens::State::StringGap);
+    else if (multiLineCommentLevel > 0)
+        result.state = int(Tokens::State::MultiLineCommentGuard) + multiLineCommentLevel;
+    return result;
+}
+
+bool Token::isValid() const
+{
+    return type != TokenType::Unknown;
+}
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskelltokenizer.h b/plugins/haskell/haskelltokenizer.h
new file mode 100644
index 0000000..46b4b00
--- /dev/null
+++ b/plugins/haskell/haskelltokenizer.h
@@ -0,0 +1,91 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#pragma once
+
+#include <QChar>
+#include <QString>
+#include <QVector>
+
+#include <memory>
+
+namespace Haskell {
+namespace Internal {
+
+// Kinds of tokens that the tokenizer produces.
+enum class TokenType {
+    Variable,
+    Constructor,
+    Operator,
+    OperatorConstructor,
+    Whitespace,
+    String,
+    StringError,
+    Char,
+    CharError,
+    EscapeSequence,
+    Integer,
+    Float,
+    Keyword,
+    Special,
+    SingleLineComment,
+    MultiLineComment,
+    Unknown
+};
+
+// A single token within one line of text.
+class Token {
+public:
+    // Returns whether the type is anything other than TokenType::Unknown.
+    bool isValid() const;
+
+    TokenType type = TokenType::Unknown;
+    int startCol = -1; // index of the first character of the token within the line
+    int length = -1; // number of characters that the token covers
+    QStringRef text; // the token's characters, a reference into 'source'
+
+    std::shared_ptr<QString> source; // keep the string ref alive
+};
+
+// The tokens of one line, together with the state that the line ends in.
+class Tokens : public QVector<Token>
+{
+public:
+    // Values for 'state'. Values above MultiLineCommentGuard are used as well,
+    // to encode the nesting depth of an unfinished multi line comment.
+    enum class State {
+        None = -1,
+        StringGap = 0, // gap == two backslashes enclosing only whitespace
+        MultiLineCommentGuard // nothing may follow that
+    };
+
+    Tokens(std::shared_ptr<QString> source);
+
+    std::shared_ptr<QString> source; // the line that the tokens refer to
+    int state = int(State::None); // state at the end of the line
+};
+
+class HaskellTokenizer
+{
+public:
+    // Tokenizes 'line'. 'startState' is the 'state' of the result for the previous line.
+    static Tokens tokenize(const QString &line, int startState);
+};
+
+} // Internal
+} // Haskell
diff --git a/tests/auto/tokenizer/tokenizer.pro b/tests/auto/tokenizer/tokenizer.pro
new file mode 100644
index 0000000..a9ec439
--- /dev/null
+++ b/tests/auto/tokenizer/tokenizer.pro
@@ -0,0 +1,11 @@
+include(../../../plugins/haskell/config.pri)
+
+include($$IDE_SOURCE_TREE/tests/auto/qttest.pri)
+
+SOURCES += tst_tokenizer.cpp \
+    $$PWD/../../../plugins/haskell/haskelltokenizer.cpp
+
+HEADERS += \
+    $$PWD/../../../plugins/haskell/haskelltokenizer.h
+
+INCLUDEPATH += $$PWD/../../../plugins/haskell
diff --git a/tests/auto/tokenizer/tst_tokenizer.cpp b/tests/auto/tokenizer/tst_tokenizer.cpp
new file mode 100644
index 0000000..ffa34b2
--- /dev/null
+++ b/tests/auto/tokenizer/tst_tokenizer.cpp
@@ -0,0 +1,730 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+** +****************************************************************************/ + +#include <haskelltokenizer.h> + +#include <QObject> +#include <QtTest> + +using namespace Haskell::Internal; + +const QSet<char> escapes{'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'}; + +struct TokenInfo +{ + TokenType type; + int column; + QString text; +}; + +Q_DECLARE_METATYPE(TokenInfo) + +bool operator==(const TokenInfo &info, const Token &token) +{ + return info.type == token.type + && info.column == token.startCol + && info.text.length() == token.length + && info.text == token.text.toString(); +} + +bool operator==(const Token &token, const TokenInfo &info) +{ + return info == token; +} + +class tst_Tokenizer : public QObject +{ + Q_OBJECT + +private slots: + void singleLineComment_data(); + void singleLineComment(); + + void multiLineComment_data(); + void multiLineComment(); + + void string_data(); + void string(); + + void character_data(); + void character(); + + void number_data(); + void number(); + + void keyword_data(); + void keyword(); + + void variable_data(); + void variable(); + + void constructor_data(); + void constructor(); + + void op_data(); + void op(); + +private: + void setupData(); + void addRow(const char *name, + const QString &input, + const QList<TokenInfo> &tokens, + Tokens::State startState = Tokens::State::None, + Tokens::State endState = Tokens::State::None); + void checkData(); +}; + +void tst_Tokenizer::setupData() +{ + QTest::addColumn<QString>("input"); + QTest::addColumn<QList<TokenInfo>>("output"); + QTest::addColumn<int>("startState"); + QTest::addColumn<int>("endState"); +} + +void tst_Tokenizer::addRow(const char *name, + const QString &input, + const QList<TokenInfo> &tokens, + Tokens::State startState, + Tokens::State endState) +{ + QTest::newRow(name) << input << tokens << int(startState) << int(endState); +} + +void tst_Tokenizer::checkData() +{ + QFETCH(QString, input); + QFETCH(QList<TokenInfo>, output); + QFETCH(int, 
startState); + QFETCH(int, endState); + const Tokens tokens = HaskellTokenizer::tokenize(input, startState); + QCOMPARE(tokens.length(), output.length()); + QCOMPARE(tokens.state, endState); + for (int i = 0; i < tokens.length(); ++i) { + const Token t = tokens.at(i); + const TokenInfo ti = output.at(i); + QVERIFY2(t == ti, QString("Token at index %1 does not match, {%2, %3, \"%4\"} != {%5, %6, \"%7\"}") + .arg(i) + .arg(int(t.type)).arg(t.startCol).arg(t.text.toString()) + .arg(int(ti.type)).arg(ti.column).arg(ti.text) + .toUtf8().constData()); + } +} + +void tst_Tokenizer::singleLineComment_data() +{ + setupData(); + + addRow("simple", " -- foo", { + {TokenType::Whitespace, 0, " "}, + {TokenType::SingleLineComment, 1, "-- foo"} + }); + addRow("dash, id", "--foo", { + {TokenType::SingleLineComment, 0, "--foo"} + }); + addRow("dash, space, op", "-- |foo", { + {TokenType::SingleLineComment, 0, "-- |foo"} + }); + addRow("multi-dash, space", "---- foo", { + {TokenType::SingleLineComment, 0, "---- foo"} + }); + addRow("dash, op", "--| foo", { + {TokenType::Operator, 0, "--|"}, + {TokenType::Whitespace, 3, " "}, + {TokenType::Variable, 4, "foo"} + }); + addRow("dash, special", "--(foo", { + {TokenType::SingleLineComment, 0, "--(foo"} + }); + addRow("not a qualified varsym", "F.-- foo", { + {TokenType::Constructor, 0, "F"}, + {TokenType::Operator, 1, "."}, + {TokenType::SingleLineComment, 2, "-- foo"} + }); +} + +void tst_Tokenizer::singleLineComment() +{ + checkData(); +} + +void tst_Tokenizer::multiLineComment_data() +{ + setupData(); + + addRow("trailing dashes", "{---foo -}", { + {TokenType::MultiLineComment, 0, "{---foo -}"} + }); + addRow("multiline", "{- foo", { + {TokenType::MultiLineComment, 0, "{- foo"} + }, + Tokens::State::None, + Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1)); + addRow("multiline2", "bar -}", { + {TokenType::MultiLineComment, 0, "bar -}"} + }, + Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1), + 
Tokens::State::None); + addRow("nested", "{- fo{-o", { + {TokenType::MultiLineComment, 0, "{- fo{-o"} + }, + Tokens::State::None, + Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2)); + addRow("nested2", "bar -}", { + {TokenType::MultiLineComment, 0, "bar -}"} + }, + Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2), + Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1)); + addRow("nested3", "bar -}", { + {TokenType::MultiLineComment, 0, "bar -}"} + }, + Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1), + Tokens::State::None); +} + +void tst_Tokenizer::multiLineComment() +{ + checkData(); +} + +void tst_Tokenizer::string_data() +{ + setupData(); + + addRow("simple", "\"foo\"", { + {TokenType::String, 0, "\"foo\""} + }); + + addRow("unterminated", "\"", { + {TokenType::StringError, 0, "\""} + }); + addRow("unterminated2", "\"foo", { + {TokenType::String, 0, "\"fo"}, + {TokenType::StringError, 3, "o"} + }); + addRow("unterminated with escape", "\"\\\\", { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, "\\"}, + {TokenType::StringError, 2, "\\"} + }); + + // gaps + addRow("gap", "\" \\ \\\"", { + {TokenType::String, 0, "\" \\ \\\""} + }); + addRow("gap over endline", "\"foo\\", { + {TokenType::String, 0, "\"foo\\"} + }, + Tokens::State::None, Tokens::State::StringGap); + addRow("gap over endline2", "\\foo\"", { + {TokenType::String, 0, "\\foo\""} + }, + Tokens::State::StringGap, Tokens::State::None); + addRow("gap error", "\"\\ ab \\\"", { + {TokenType::String, 0, "\"\\ "}, + {TokenType::StringError, 3, "ab"}, + {TokenType::String, 5, " \\\""} + }); + addRow("gap error with quote", "\"\\ \"", { + {TokenType::String, 0, "\"\\ "}, + {TokenType::StringError, 3, "\""} + }, + Tokens::State::None, Tokens::State::StringGap); + + // char escapes (including wrong ones) + for (char c = '!'; c <= '~'; ++c) { + // skip uppercase and '^', since these can be part of ascii escapes + // and 'o' and 'x' since they start octal 
and hex escapes + // and digits as part of decimal escape + if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x') + continue; + const QChar qc(c); + const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8(); + const QString input = QString("\"\\%1\"").arg(qc); + if (escapes.contains(c)) { + addRow(name.constData(), input, { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc}, + {TokenType::String, 3, "\""} + }); + } else { + addRow(name.constData(), input, { + {TokenType::String, 0, "\"\\"}, + {TokenType::StringError, 2, qc}, + {TokenType::String, 3, "\""} + }); + } + } + + addRow("decimal escape", "\"\\1234a\"", { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, "\\1234"}, + {TokenType::String, 6, "a\""} + }); + + addRow("octal escape", "\"\\o0678a\"", { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, "\\o067"}, + {TokenType::String, 6, "8a\""} + }); + addRow("octal escape error", "\"\\o8a\"", { + {TokenType::String, 0, "\"\\"}, + {TokenType::StringError, 2, "o"}, + {TokenType::String, 3, "8a\""} + }); + + addRow("hexadecimal escape", "\"\\x0678Abg\"", { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, "\\x0678Ab"}, + {TokenType::String, 9, "g\""} + }); + addRow("hexadecimal escape error", "\"\\xg\"", { + {TokenType::String, 0, "\"\\"}, + {TokenType::StringError, 2, "x"}, + {TokenType::String, 3, "g\""} + }); + + // ascii cntrl escapes (including wrong ones) + for (char c = '!'; c <= '~'; ++c) { + if (c == '"') // is special because it also ends the string + continue; + const QChar qc(c); + const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8(); + const QString input = QString("\"\\^%1\"").arg(qc); + if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']' + || qc == '^' || qc == '_') { + addRow(name.constData(), input, { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, 
QLatin1String("\\^") + qc}, + {TokenType::String, 4, "\""} + }); + } else { + addRow(name.constData(), input, { + {TokenType::String, 0, "\"\\"}, + {TokenType::StringError, 2, "^"}, + {TokenType::String, 3, QString(qc) + "\""} + }); + } + } + + addRow("ascii escape SOH", "\"\\SOHN\"", { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, "\\SOH"}, + {TokenType::String, 5, "N\""} + }); + addRow("ascii escape SO", "\"\\SON\"", { + {TokenType::String, 0, "\""}, + {TokenType::EscapeSequence, 1, "\\SO"}, + {TokenType::String, 4, "N\""} + }); + addRow("ascii escape error", "\"\\TON\"", { + {TokenType::String, 0, "\"\\"}, + {TokenType::StringError, 2, "T"}, + {TokenType::String, 3, "ON\""} + }); + addRow("ascii escape error 2", "\"\\STO\"", { + {TokenType::String, 0, "\"\\"}, + {TokenType::StringError, 2, "S"}, + {TokenType::String, 3, "TO\""} + }); +} + +void tst_Tokenizer::string() +{ + checkData(); +} + +void tst_Tokenizer::character_data() +{ + setupData(); + + addRow("simple", "'a'", { + {TokenType::Char, 0, "'a'"} + }); + addRow("too many", "'abc'", { + {TokenType::Char, 0, "'a"}, + {TokenType::CharError, 2, "bc"}, + {TokenType::Char, 4, "'"} + }); + addRow("too few", "''", { + {TokenType::Char, 0, "'"}, + {TokenType::CharError, 1, "'"} + }); + addRow("only quote", "'", { + {TokenType::CharError, 0, "'"} + }); + addRow("unterminated", "'a", { + {TokenType::Char, 0, "'"}, + {TokenType::CharError, 1, "a"} + }); + addRow("unterminated too many", "'abc", { + {TokenType::Char, 0, "'a"}, + {TokenType::CharError, 2, "bc"} + }); + addRow("unterminated backslash", "'\\", { + {TokenType::Char, 0, "'"}, + {TokenType::CharError, 1, "\\"} + }); + + // char escapes (including wrong ones) + for (char c = '!'; c <= '~'; ++c) { + // skip uppercase and '^', since these can be part of ascii escapes + // and 'o' and 'x' since they start octal and hex escapes + // and digits as part of decimal escape + if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 
'o' || c == 'x') + continue; + const QChar qc(c); + const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8(); + const QString input = QString("'\\%1'").arg(qc); + if (c != '&' && escapes.contains(c)) { + addRow(name.constData(), input, { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc}, + {TokenType::Char, 3, "'"} + }); + } else { + addRow(name.constData(), input, { + {TokenType::Char, 0, "'\\"}, + {TokenType::CharError, 2, qc}, + {TokenType::Char, 3, "'"} + }); + } + } + + addRow("decimal escape", "'\\1234'", { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, "\\1234"}, + {TokenType::Char, 6, "'"} + }); + addRow("decimal escape too long", "'\\1234a'", { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, "\\1234"}, + {TokenType::CharError, 6, "a"}, + {TokenType::Char, 7, "'"} + }); + + addRow("octal escape", "'\\o067'", { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, "\\o067"}, + {TokenType::Char, 6, "'"} + }); + addRow("octal escape error", "'\\o8'", { + {TokenType::Char, 0, "'\\"}, + {TokenType::CharError, 2, "o"}, + {TokenType::CharError, 3, "8"}, + {TokenType::Char, 4, "'"} + }); + + addRow("hexadecimal escape", "'\\x0678Ab'", { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, "\\x0678Ab"}, + {TokenType::Char, 9, "'"} + }); + addRow("hexadecimal escape error", "'\\xg'", { + {TokenType::Char, 0, "'\\"}, + {TokenType::CharError, 2, "x"}, + {TokenType::CharError, 3, "g"}, + {TokenType::Char, 4, "'"} + }); + + // ascii cntrl escapes (including wrong ones) + for (char c = '!'; c <= '~'; ++c) { + if (c == '\'') // is special because it also ends the string + continue; + const QChar qc(c); + const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8(); + const QString input = QString("'\\^%1'").arg(qc); + if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']' + || qc == '^' || qc == '_') { + addRow(name.constData(), input, { + 
{TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc}, + {TokenType::Char, 4, "'"} + }); + } else { + addRow(name.constData(), input, { + {TokenType::Char, 0, "'\\"}, + {TokenType::CharError, 2, "^"}, + {TokenType::CharError, 3, qc}, + {TokenType::Char, 4, "'"} + }); + } + } + + addRow("ascii escape SOH", "'\\SOH'", { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, "\\SOH"}, + {TokenType::Char, 5, "'"} + }); + addRow("ascii escape SO, too long", "'\\SON'", { + {TokenType::Char, 0, "'"}, + {TokenType::EscapeSequence, 1, "\\SO"}, + {TokenType::CharError, 4, "N"}, + {TokenType::Char, 5, "'"} + }); + addRow("ascii escape error", "'\\TON'", { + {TokenType::Char, 0, "'\\"}, + {TokenType::CharError, 2, "T"}, + {TokenType::CharError, 3, "ON"}, + {TokenType::Char, 5, "'"} + }); +} + +void tst_Tokenizer::character() +{ + checkData(); +} + +void tst_Tokenizer::number_data() +{ + setupData(); + + addRow("decimal", "012345", { + {TokenType::Integer, 0, "012345"} + }); + addRow("single digit decimal", "0", { + {TokenType::Integer, 0, "0"} + }); + addRow("octal", "0o1234", { + {TokenType::Integer, 0, "0o1234"} + }); + // this is a bit weird, but correct: octal 1 followed by decimal 8 + addRow("number after octal", "0O18", { + {TokenType::Integer, 0, "0O1"}, + {TokenType::Integer, 3, "8"} + }); + addRow("not octal", "0o9", { + {TokenType::Integer, 0, "0"}, + {TokenType::Variable, 1, "o9"}, + }); + addRow("hexadecimal", "0x9fA", { + {TokenType::Integer, 0, "0x9fA"} + }); + // hex number followed by identifier 'g' + addRow("hexadecimal", "0X9fg", { + {TokenType::Integer, 0, "0X9f"}, + {TokenType::Variable, 4, "g"} + }); + + // 0 followed by identifier + addRow("decimal followed by identifier", "0z6", { + {TokenType::Integer, 0, "0"}, + {TokenType::Variable, 1, "z6"} + }); + + addRow("float", "0123.45", { + {TokenType::Float, 0, "0123.45"} + }); + addRow("decimal + operator '.'", "0123.", { + {TokenType::Integer, 0, "0123"}, + 
{TokenType::Operator, 4, "."} + }); + addRow("operator '.' + decimal", ".0123", { + {TokenType::Operator, 0, "."}, + {TokenType::Integer, 1, "0123"} + }); + addRow("without '.', with exp 'e'", "0123e45", { + {TokenType::Float, 0, "0123e45"} + }); + addRow("without '.', with exp 'E'", "0123E45", { + {TokenType::Float, 0, "0123E45"} + }); + addRow("without '.', with '+'", "0123e+45", { + {TokenType::Float, 0, "0123e+45"} + }); + addRow("without '.', with '-'", "0123e-45", { + {TokenType::Float, 0, "0123e-45"} + }); + addRow("without '.', with '+', missing decimal", "0123e+", { + {TokenType::Integer, 0, "0123"}, + {TokenType::Variable, 4, "e"}, + {TokenType::Operator, 5, "+"} + }); + addRow("without '.', missing decimal", "0123e", { + {TokenType::Integer, 0, "0123"}, + {TokenType::Variable, 4, "e"} + }); + addRow("exp 'e'", "01.23e45", { + {TokenType::Float, 0, "01.23e45"} + }); + addRow("exp 'E'", "01.23E45", { + {TokenType::Float, 0, "01.23E45"} + }); + addRow("with '+'", "01.23e+45", { + {TokenType::Float, 0, "01.23e+45"} + }); + addRow("with '-'", "01.23e-45", { + {TokenType::Float, 0, "01.23e-45"} + }); + addRow("with '+', missing decimal", "01.23e+", { + {TokenType::Float, 0, "01.23"}, + {TokenType::Variable, 5, "e"}, + {TokenType::Operator, 6, "+"} + }); + addRow("missing decimal", "01.23e", { + {TokenType::Float, 0, "01.23"}, + {TokenType::Variable, 5, "e"} + }); +} + +void tst_Tokenizer::number() +{ + checkData(); +} + +void tst_Tokenizer::keyword_data() +{ + setupData(); + + addRow("data", "data", { + {TokenType::Keyword, 0, "data"} + }); + addRow("not a qualified varid", "Foo.case", { + {TokenType::Constructor, 0, "Foo"}, + {TokenType::Operator, 3, "."}, + {TokenType::Keyword, 4, "case"} + }); + addRow(":", ":", { + {TokenType::Keyword, 0, ":"} + }); + addRow("->", "->", { + {TokenType::Keyword, 0, "->"} + }); + addRow("not a qualified varsym", "Foo...", { + {TokenType::Constructor, 0, "Foo"}, + {TokenType::Operator, 3, "..."} + }); +} + +void 
tst_Tokenizer::keyword() +{ + checkData(); +} + +void tst_Tokenizer::variable_data() +{ + setupData(); + + addRow("simple", "fOo_1'", { + {TokenType::Variable, 0, "fOo_1'"} + }); + addRow("start with '_'", "_1", { + {TokenType::Variable, 0, "_1"} + }); + addRow("not a keyword", "cases", { + {TokenType::Variable, 0, "cases"} + }); + addRow("not a keyword 2", "qualified", { + {TokenType::Variable, 0, "qualified"} + }); + addRow("not a keyword 3", "as", { + {TokenType::Variable, 0, "as"} + }); + addRow("not a keyword 4", "hiding", { + {TokenType::Variable, 0, "hiding"} + }); + addRow(".variable", ".foo", { + {TokenType::Operator, 0, "."}, + {TokenType::Variable, 1, "foo"} + }); + addRow("variable.", "foo.", { + {TokenType::Variable, 0, "foo"}, + {TokenType::Operator, 3, "."} + }); + addRow("variable.variable", "blah.foo", { + {TokenType::Variable, 0, "blah"}, + {TokenType::Operator, 4, "."}, + {TokenType::Variable, 5, "foo"} + }); + addRow("qualified", "Blah.foo", { + {TokenType::Variable, 0, "Blah.foo"} + }); + addRow("qualified2", "Goo.Blah.foo", { + {TokenType::Variable, 0, "Goo.Blah.foo"} + }); + addRow("variable + op '..'", "foo..", { + {TokenType::Variable, 0, "foo"}, + {TokenType::Keyword, 3, ".."} + }); + addRow("variable + op '...'", "foo...", { + {TokenType::Variable, 0, "foo"}, + {TokenType::Operator, 3, "..."} + }); +} + +void tst_Tokenizer::variable() +{ + checkData(); +} + +void tst_Tokenizer::constructor_data() +{ + setupData(); + + addRow("simple", "Foo", { + {TokenType::Constructor, 0, "Foo"} + }); + addRow("qualified", "Foo.Bar", { + {TokenType::Constructor, 0, "Foo.Bar"} + }); + addRow("followed by op '.'", "Foo.Bar.", { + {TokenType::Constructor, 0, "Foo.Bar"}, + {TokenType::Operator, 7, "."} + }); + +} + +void tst_Tokenizer::constructor() +{ + checkData(); +} + +void tst_Tokenizer::op_data() +{ + setupData(); + + addRow("simple", "+-=", { + {TokenType::Operator, 0, "+-="} + }); + addRow("qualified", "Foo.+-=", { + {TokenType::Operator, 0, 
"Foo.+-="} + }); + addRow("qualified '.'", "Foo..", { + {TokenType::Operator, 0, "Foo.."} + }); + addRow("constructor plus op", "Foo+", { + {TokenType::Constructor, 0, "Foo"}, + {TokenType::Operator, 3, "+"} + }); +} + +void tst_Tokenizer::op() +{ + checkData(); +} + +QTEST_MAIN(tst_Tokenizer) + +#include "tst_tokenizer.moc" |