aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eike Ziller <git@eikeziller.de>, 2017-04-29 16:17:11 +0200
committer: Eike Ziller <git@eikeziller.de>, 2017-10-01 20:11:08 +0200
commit: 5798e33d742c0f413d2d865fdb75739b4374ce98 (patch)
tree: e7d36edf5de22ab74ed4b56e2e2b22be24f50ef6
parent: 2f69373309cfe88084c5777baeff6bb46eecd071 (diff)
Add highlighter
See lexical structure of Haskell https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
-rw-r--r-- plugins/haskell/haskell.pro 8
-rw-r--r-- plugins/haskell/haskelleditorfactory.cpp 2
-rw-r--r-- plugins/haskell/haskellhighlighter.cpp 152
-rw-r--r-- plugins/haskell/haskellhighlighter.h 58
-rw-r--r-- plugins/haskell/haskelltokenizer.cpp 631
-rw-r--r-- plugins/haskell/haskelltokenizer.h 91
-rw-r--r-- tests/auto/tokenizer/tokenizer.pro 11
-rw-r--r-- tests/auto/tokenizer/tst_tokenizer.cpp 730
8 files changed, 1681 insertions, 2 deletions
diff --git a/plugins/haskell/haskell.pro b/plugins/haskell/haskell.pro
index 9c557b1..aa92d3c 100644
--- a/plugins/haskell/haskell.pro
+++ b/plugins/haskell/haskell.pro
@@ -5,14 +5,18 @@ DEFINES += HASKELL_LIBRARY
SOURCES += \
haskellcompletionassist.cpp \
haskelleditorfactory.cpp \
- haskellplugin.cpp
+ haskellplugin.cpp \
+ haskellhighlighter.cpp \
+ haskelltokenizer.cpp
HEADERS += \
haskell_global.h \
haskellcompletionassist.h \
haskellconstants.h \
haskelleditorfactory.h \
- haskellplugin.h
+ haskellplugin.h \
+ haskellhighlighter.h \
+ haskelltokenizer.h
## uncomment to build plugin into user config directory
## <localappdata>/plugins/<ideversion>
diff --git a/plugins/haskell/haskelleditorfactory.cpp b/plugins/haskell/haskelleditorfactory.cpp
index 220e52e..8119105 100644
--- a/plugins/haskell/haskelleditorfactory.cpp
+++ b/plugins/haskell/haskelleditorfactory.cpp
@@ -27,6 +27,7 @@
#include "haskellcompletionassist.h"
#include "haskellconstants.h"
+#include "haskellhighlighter.h"
#include <texteditor/textdocument.h>
#include <texteditor/texteditoractionhandler.h>
@@ -47,6 +48,7 @@ HaskellEditorFactory::HaskellEditorFactory()
setParenthesesMatchingEnabled(true);
setMarksVisible(true);
setCompletionAssistProvider(new HaskellCompletionAssistProvider);
+ setSyntaxHighlighterCreator([] { return new HaskellHighlighter(); });
}
} // Internal
diff --git a/plugins/haskell/haskellhighlighter.cpp b/plugins/haskell/haskellhighlighter.cpp
new file mode 100644
index 0000000..9899cc4
--- /dev/null
+++ b/plugins/haskell/haskellhighlighter.cpp
@@ -0,0 +1,152 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include "haskellhighlighter.h"
+
+#include "haskelltokenizer.h"
+
+#include <texteditor/fontsettings.h>
+#include <texteditor/texteditorconstants.h>
+#include <texteditor/texteditorsettings.h>
+
+#include <QDebug>
+#include <QVector>
+
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, IMPORT_HIGHLIGHTS, ({ // Variable tokens that highlightBlock() formats as keywords once an "import" keyword was seen on the line.
+ "qualified",
+ "as",
+ "hiding"
+}));
+
+using namespace TextEditor;
+
+namespace Haskell {
+namespace Internal {
+
+HaskellHighlighter::HaskellHighlighter() // Registers the default text format categories and initializes the top-level declaration format.
+{
+ setDefaultTextFormatCategories();
+ updateFormats(TextEditorSettings::fontSettings()); // fills m_toplevelDeclFormat from TextEditorSettings::fontSettings()
+}
+
+// Formats one line ("block") of Haskell source.
+//
+// The line is tokenized with the state left behind by the previous block, so that string
+// gaps and nested multi-line comments continue across lines; the resulting state is stored
+// as the current block state. Each token is then formatted according to its type.
+// Two pieces of per-line context are tracked while walking the tokens:
+//  - a "::" keyword that directly follows the first non-whitespace token marks a top-level
+//    declaration: that first token gets m_toplevelDeclFormat and the variables that follow
+//    on the line are formatted as C_LOCAL;
+//  - after an "import" keyword, the variables listed in IMPORT_HIGHLIGHTS are formatted as
+//    keywords.
+void HaskellHighlighter::highlightBlock(const QString &text)
+{
+    const Tokens tokens = HaskellTokenizer::tokenize(text, previousBlockState());
+    setCurrentBlockState(tokens.state);
+    // first and second non-whitespace token of this line, null until seen
+    const Token *firstNonWS = nullptr;
+    const Token *secondNonWS = nullptr;
+    bool inType = false;
+    bool inImport = false;
+    for (const Token &token : tokens) {
+        switch (token.type) {
+        case TokenType::Variable:
+            if (inType)
+                setTokenFormat(token, C_LOCAL);
+            else if (inImport && IMPORT_HIGHLIGHTS->contains(token.text.toString()))
+                setTokenFormat(token, C_KEYWORD);
+            // any other variable is left without an explicit format
+            break;
+        case TokenType::Constructor:
+        case TokenType::OperatorConstructor:
+            setTokenFormat(token, C_TYPE);
+            break;
+        case TokenType::Operator:
+            setTokenFormat(token, C_OPERATOR);
+            break;
+        case TokenType::Whitespace:
+            setTokenFormat(token, C_VISUAL_WHITESPACE);
+            break;
+        case TokenType::Keyword:
+            if (token.text == "::" && firstNonWS && !secondNonWS) { // toplevel declaration
+                setFormat(firstNonWS->startCol, firstNonWS->length, m_toplevelDeclFormat);
+                inType = true;
+            } else if (token.text == "import") {
+                inImport = true;
+            }
+            setTokenFormat(token, C_KEYWORD);
+            break;
+        case TokenType::Integer:
+        case TokenType::Float:
+            setTokenFormat(token, C_NUMBER);
+            break;
+        case TokenType::String:
+        case TokenType::Char:
+            setTokenFormatWithSpaces(text, token, C_STRING);
+            break;
+        case TokenType::EscapeSequence:
+            setTokenFormat(token, C_PRIMITIVE_TYPE);
+            break;
+        case TokenType::SingleLineComment:
+        case TokenType::MultiLineComment:
+            setTokenFormatWithSpaces(text, token, C_COMMENT);
+            break;
+        case TokenType::Special:
+            // special characters are left without an explicit format
+            break;
+        case TokenType::StringError:
+        case TokenType::CharError:
+        case TokenType::Unknown:
+            setTokenFormat(token, C_PARENTHESES_MISMATCH);
+            break;
+        }
+        // remember the first two non-whitespace tokens for the "name ::" check above
+        if (token.type != TokenType::Whitespace) {
+            if (!firstNonWS)
+                firstNonWS = &token;
+            else if (!secondNonWS)
+                secondNonWS = &token;
+        }
+    }
+}
+
+void HaskellHighlighter::setFontSettings(const FontSettings &fontSettings) // Forwards the new font settings to the base class and refreshes the cached declaration format.
+{
+ SyntaxHighlighter::setFontSettings(fontSettings);
+ updateFormats(fontSettings); // keeps m_toplevelDeclFormat in sync with the new settings
+}
+
+// Recomputes m_toplevelDeclFormat, the character format used for the name of a top-level
+// declaration, as the C_FUNCTION style with C_DECLARATION mixed in.
+void HaskellHighlighter::updateFormats(const FontSettings &fontSettings)
+{
+    const auto declarationStyle = TextStyles::mixinStyle(C_FUNCTION, C_DECLARATION);
+    m_toplevelDeclFormat = fontSettings.toTextCharFormat(declarationStyle);
+}
+
+// Applies the format registered for `style` to the columns covered by `token`.
+void HaskellHighlighter::setTokenFormat(const Token &token, TextStyle style)
+{
+    const QTextCharFormat format = formatForCategory(style);
+    setFormat(token.startCol, token.length, format);
+}
+
+// Same as setTokenFormat(), but goes through setFormatWithSpaces(), which also receives the
+// line text.
+void HaskellHighlighter::setTokenFormatWithSpaces(const QString &text, const Token &token,
+                                                  TextStyle style)
+{
+    const QTextCharFormat format = formatForCategory(style);
+    setFormatWithSpaces(text, token.startCol, token.length, format);
+}
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskellhighlighter.h b/plugins/haskell/haskellhighlighter.h
new file mode 100644
index 0000000..6213333
--- /dev/null
+++ b/plugins/haskell/haskellhighlighter.h
@@ -0,0 +1,58 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#pragma once
+
+#include <texteditor/syntaxhighlighter.h>
+
+#include <QHash>
+#include <QTextFormat>
+
+namespace Haskell {
+namespace Internal {
+
+class Token;
+
+class HaskellHighlighter : public TextEditor::SyntaxHighlighter // Syntax highlighter that formats Haskell source line by line from HaskellTokenizer output.
+{
+ Q_OBJECT
+
+public:
+ HaskellHighlighter();
+
+protected:
+ void highlightBlock(const QString &text) override; // tokenizes `text` and applies one format per token
+
+private:
+ void setFontSettings(const TextEditor::FontSettings &fontSettings) override; // also refreshes m_toplevelDeclFormat
+ void updateFormats(const TextEditor::FontSettings &fontSettings); // recomputes m_toplevelDeclFormat
+ void setTokenFormat(const Token &token, TextEditor::TextStyle style); // formats the token's column range with the style's format
+ void setTokenFormatWithSpaces(const QString &text, const Token &token,
+ TextEditor::TextStyle style); // like setTokenFormat(), via setFormatWithSpaces()
+ QTextCharFormat m_toplevelDeclFormat; // format for the first token of a line whose second token is "::"
+};
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskelltokenizer.cpp b/plugins/haskell/haskelltokenizer.cpp
new file mode 100644
index 0000000..527e505
--- /dev/null
+++ b/plugins/haskell/haskelltokenizer.cpp
@@ -0,0 +1,631 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include "haskelltokenizer.h"
+
+#include <QSet>
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_OP, ({ // Symbol sequences that getIdOrOpOrSingleLineComment() reports as Keyword instead of Operator.
+ "..",
+ ":",
+ "::",
+ "=",
+ "\\",
+ "|",
+ "<-",
+ "->",
+ "@",
+ "~",
+ "=>",
+
+ // Arrows GHC extension
+ "-<",
+ "-<<",
+ ">-",
+ ">>-",
+ "(|", // NOTE(review): '(' and ')' are in SPECIAL, which isSymbol() rejects, so "(|" and "|)" look unreachable from the operator lexer - confirm
+ "|)"
+}));
+
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_ID, ({ // Identifiers that getIdOrOpOrSingleLineComment() reports as Keyword instead of Variable.
+ "case",
+ "class",
+ "data",
+ "default",
+ "deriving",
+ "do",
+ "else",
+ "foreign",
+ "if",
+ "import",
+ "in",
+ "infix",
+ "infixl",
+ "infixr",
+ "instance",
+ "let",
+ "module",
+ "newtype",
+ "of",
+ "then",
+ "type",
+ "where",
+ "_",
+
+ // from GHC extensions
+ "family",
+ "forall",
+ "mdo",
+ "proc",
+ "rec"
+}));
+
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, SPECIAL, ({ // Characters that getSpecial() emits as single-character Special tokens; isSymbol() excludes them from operators.
+ '(',
+ ')',
+ ',',
+ ';',
+ '[',
+ ']',
+ '`',
+ '{',
+ '}',
+}));
+
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, CHAR_ESCAPES, // Characters that form a one-character escape when they follow a backslash (see getEscape()).
+ ({'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'}));
+
+Q_GLOBAL_STATIC_WITH_ARGS(QVector<QString>, ASCII_ESCAPES, ({ // Named ASCII escapes; getEscape() returns the first entry that prefixes the text, so order matters.
+ "NUL",
+ "SOH", // must be before "SO" to match
+ "STX",
+ "ETX",
+ "EOT",
+ "ENQ",
+ "ACK",
+ "BEL",
+ "BS",
+ "HT",
+ "LF",
+ "VT",
+ "FF",
+ "CR",
+ "SO",
+ "SI",
+ "DLE",
+ "DC1",
+ "DC2",
+ "DC3",
+ "DC4",
+ "NAK",
+ "SYN",
+ "ETB",
+ "CAN",
+ "EM",
+ "SUB",
+ "ESC",
+ "FS",
+ "GS",
+ "RS",
+ "US",
+ "SP",
+ "DEL"
+}));
+
+namespace Haskell {
+namespace Internal {
+
+// Builds a Token of the given type that covers the half-open column range [start, end) of
+// *line. The token keeps a reference into the line plus the shared pointer that owns it.
+Token token(TokenType type, std::shared_ptr<QString> line, int start, int end)
+{
+    const int length = end - start;
+    return {type, start, length, line->midRef(start, length), line};
+}
+
+// Creates an empty token list that shares ownership of the line text it refers to.
+Tokens::Tokens(std::shared_ptr<QString> source)
+    : source{source}
+{
+}
+
+// Returns how many consecutive characters of `line`, starting at index `begin`, satisfy
+// `test`. Returns 0 if `begin` is at or past the end of the line or if the character at
+// `begin` does not satisfy `test`.
+//
+// `test` is any callable taking a `const QChar &` and returning something convertible to
+// bool. Taking it as a template parameter avoids constructing a std::function wrapper on
+// every call.
+template<typename Predicate>
+static int grab(const QString &line, int begin, Predicate test)
+{
+    const int length = line.length();
+    int current = begin;
+    while (current < length && test(line.at(current)))
+        ++current;
+    return current - begin;
+}
+
+
+// True for characters that may continue an identifier: letters, numbers, "'" and '_'.
+static bool isIdentifierChar(const QChar &c)
+{
+    if (c == '\'' || c == '_')
+        return true;
+    return c.isLetterOrNumber();
+}
+
+// True for characters that may start a variable identifier: a lowercase letter or '_'.
+static bool isVariableIdentifierStart(const QChar &c)
+{
+    return c.isLower() || c == '_';
+}
+
+// True for the ASCII characters that can be part of an operator symbol.
+static bool isAscSymbol(const QChar &c)
+{
+    switch (c.unicode()) {
+    case '!':
+    case '#':
+    case '$':
+    case '%':
+    case '&':
+    case '*':
+    case '+':
+    case '.':
+    case '/':
+    case '<':
+    case '=':
+    case '>':
+    case '?':
+    case '@':
+    case '\\':
+    case '^':
+    case '|':
+    case '-':
+    case '~':
+    case ':':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// True for characters that can be part of an operator: the ASCII symbols, plus any other
+// symbol or punctuation character that is not '_', '"', "'" or one of the SPECIAL characters.
+static bool isSymbol(const QChar &c)
+{
+    if (isAscSymbol(c))
+        return true;
+    if (c == '_' || c == '"' || c == '\'')
+        return false;
+    if (!c.isSymbol() && !c.isPunct())
+        return false;
+    return !SPECIAL->contains(c);
+}
+
+static bool isDigit(const QChar &c) // Free-function wrapper around QChar::isDigit() so it can be passed to grab().
+{
+ return c.isDigit();
+}
+
+// True for the octal digits '0' to '7'.
+static bool isOctit(const QChar &c)
+{
+    const auto code = c.unicode();
+    return code >= '0' && code <= '7';
+}
+
+// True for hexadecimal digits: anything QChar::isDigit() accepts, 'A'-'F' and 'a'-'f'.
+static bool isHexit(const QChar &c)
+{
+    if (c.isDigit())
+        return true;
+    const auto code = c.unicode();
+    const bool upperHex = code >= 'A' && code <= 'F';
+    const bool lowerHex = code >= 'a' && code <= 'f';
+    return upperHex || lowerHex;
+}
+
+// True for the characters that may follow '^' in a control escape:
+// 'A'-'Z', '@', '[', '\', ']', '^' and '_'.
+static bool isCntrl(const QChar &c)
+{
+    const auto code = c.unicode();
+    if (code >= 'A' && code <= 'Z')
+        return true;
+    switch (code) {
+    case '@':
+    case '[':
+    case '\\':
+    case ']':
+    case '^':
+    case '_':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Lexes a run of whitespace that starts at column `start`.
+// Returns a single Whitespace token covering the run, or an empty vector if the character at
+// `start` is not whitespace. The caller must pass start < line->length().
+static QVector<Token> getSpace(std::shared_ptr<QString> line, int start)
+{
+    const int lineLength = line->length();
+    int end = start;
+    while (end < lineLength && line->at(end).isSpace())
+        ++end;
+    if (end == start)
+        return {};
+    const int length = end - start;
+    return {{TokenType::Whitespace, start, length, line->midRef(start, length), line}};
+}
+
+static QVector<Token> getNumber(std::shared_ptr<QString> line, int start) // Lexes an Integer or Float literal at `start`; returns {} if the character there is not a digit.
+{
+ const QChar &startC = line->at(start);
+ if (!startC.isDigit())
+ return {};
+ const int length = line->length();
+ int current = start + 1; // end (exclusive) of the literal found so far; a lone digit at line end is an Integer
+ TokenType type = TokenType::Integer;
+ if (current < length) {
+ if (startC == '0') {
+ // check for octal or hexadecimal
+ const QChar &secondC = line->at(current);
+ if (secondC == 'o' || secondC == 'O') {
+ const int numLen = grab(*line, current + 1, isOctit);
+ if (numLen > 0) // without digits after the prefix, fall through and lex the "0" as decimal
+ return {token(TokenType::Integer, line, start, current + numLen + 1)};
+ } else if (secondC == 'x' || secondC == 'X') {
+ const int numLen = grab(*line, current + 1, isHexit);
+ if (numLen > 0)
+ return {token(TokenType::Integer, line, start, current + numLen + 1)};
+ }
+ }
+ // starts with decimal
+ const int numLen = grab(*line, start, isDigit);
+ current = start + numLen;
+ // check for floating point
+ if (current < length && line->at(current) == '.') {
+ const int numLen = grab(*line, current + 1, isDigit);
+ if (numLen > 0) { // the '.' only belongs to the number if digits follow it
+ current += numLen + 1;
+ type = TokenType::Float;
+ }
+ }
+ // check for exponent
+ if (current + 1 < length /*for at least 'e' and digit*/
+ && (line->at(current) == 'e' || line->at(current) == 'E')) {
+ int expEnd = current + 1;
+ if (line->at(expEnd) == '+' || line->at(expEnd) == '-') // optional sign
+ ++expEnd;
+ const int numLen = grab(*line, expEnd, isDigit);
+ if (numLen > 0) { // the exponent only counts if at least one digit follows
+ current = expEnd + numLen;
+ type = TokenType::Float;
+ }
+ }
+ }
+ return {token(type, line, start, current)};
+}
+
+static QVector<Token> getIdOrOpOrSingleLineComment(std::shared_ptr<QString> line, int start) // Lexes a possibly module-qualified constructor, variable, keyword or operator, or a "--" comment; returns {} if none starts here.
+{
+ const int length = line->length();
+ if (start >= length)
+ return {};
+ int current = start;
+ // check for {conid.}conid
+ int conidEnd = start; // end (exclusive) of the last constructor id read; == start if none was read
+ bool canOnlyBeConstructor = false;
+ while (current < length && line->at(current).isUpper()) {
+ current += grab(*line, current, isIdentifierChar);
+ conidEnd = current;
+ // it is definitely a constructor id if it is not followed by a '.'
+ canOnlyBeConstructor = current >= length || line->at(current) != '.';
+ // otherwise it might be a module id, and we skip the dot to check for qualified thing
+ if (!canOnlyBeConstructor)
+ ++current;
+ }
+ if (canOnlyBeConstructor)
+ return {token(TokenType::Constructor, line, start, conidEnd)};
+
+ // check for variable or reserved id
+ if (current < length && isVariableIdentifierStart(line->at(current))) {
+ const int varLen = grab(*line, current, isIdentifierChar);
+ // check for reserved id
+ if (RESERVED_ID->contains(line->mid(current, varLen))) {
+ QVector<Token> result;
+ // possibly add constructor + op '.'
+ if (conidEnd > start) {
+ result.append(token(TokenType::Constructor, line, start, conidEnd));
+ result.append(token(TokenType::Operator, line, conidEnd, current));
+ }
+ result.append(token(TokenType::Keyword, line, current, current + varLen));
+ return result;
+ }
+ return {token(TokenType::Variable, line, start, current + varLen)}; // the token includes any module qualifier read above
+ }
+ // check for operator
+ if (current < length && isSymbol(line->at(current))) {
+ const int opLen = grab(*line, current, isSymbol);
+ // check for reserved op
+ if (RESERVED_OP->contains(line->mid(current, opLen))) {
+ // because of the case of F... (constructor + op '...') etc
+ // we only add conid if we have one, handling the rest in next iteration
+ if (conidEnd > start)
+ return {token(TokenType::Constructor, line, start, conidEnd)};
+ return {token(TokenType::Keyword, line, start, current + opLen)};
+ }
+ // check for single line comment
+ if (opLen >= 2 && std::all_of(line->begin() + current, line->begin() + current + opLen,
+ [](const QChar c) { return c == '-'; })) { // the whole symbol run must consist of dashes
+ QVector<Token> result;
+ // possibly add constructor + op '.'
+ if (conidEnd > start) {
+ result.append(token(TokenType::Constructor, line, start, conidEnd));
+ result.append(token(TokenType::Operator, line, conidEnd, current));
+ }
+ // rest is comment
+ result.append(token(TokenType::SingleLineComment, line, current, length));
+ return result;
+ }
+ // check for (qualified?) operator constructor
+ if (line->at(current) == ':')
+ return {token(TokenType::OperatorConstructor, line, start, current + opLen)};
+ return {token(TokenType::Operator, line, start, current + opLen)};
+ }
+ // Foo.Blah.
+ if (conidEnd > start)
+ return {token(TokenType::Constructor, line, start, conidEnd)}; // the trailing '.' is left for the next tokenizer iteration
+ return {};
+}
+
+// Returns the length of the escape sequence that starts at line[start], which is the
+// character right after the backslash, or 0 if no valid escape starts there. The backslash
+// itself is not counted. The caller must guarantee start < line.length().
+//
+// Recognized forms, following the Haskell 2010 lexical structure:
+//  - a single character from CHAR_ESCAPES
+//  - a decimal number, 'o' + octal number, 'x' + hexadecimal number
+//  - '^' followed by exactly one control character (see isCntrl)
+//  - one of the names in ASCII_ESCAPES
+static int getEscape(const QString &line, int start)
+{
+    const QChar first = line.at(start);
+    if (CHAR_ESCAPES->contains(first))
+        return 1;
+
+    // decimal
+    if (first.isDigit())
+        return grab(line, start + 1, isDigit) + 1;
+    // octal
+    if (first == 'o') {
+        const int count = grab(line, start + 1, isOctit);
+        if (count < 1) // no octal number after 'o'
+            return 0;
+        return count + 1;
+    }
+    // hexadecimal
+    if (first == 'x') {
+        const int count = grab(line, start + 1, isHexit);
+        if (count < 1) // no hexadecimal number after 'x'
+            return 0;
+        return count + 1;
+    }
+    // ascii cntrl
+    if (first == '^') {
+        // The escape is '^' plus exactly one control character, so that for example
+        // "\^AB" is the escape "\^A" followed by a plain 'B'.
+        const int next = start + 1;
+        if (next < line.length() && isCntrl(line.at(next)))
+            return 2;
+        return 0; // no control character after '^'
+    }
+    // named ascii escapes, the first entry that matches wins
+    const QStringRef s = line.midRef(start);
+    for (const QString &esc : *ASCII_ESCAPES) {
+        if (s.startsWith(esc))
+            return esc.length();
+    }
+    return 0;
+}
+
+static QVector<Token> getString(std::shared_ptr<QString> line, int start, bool *inStringGap/*in-out*/) // Lexes a string literal at `start`, or continues one if *inStringGap is set; returns {} if no string starts here.
+{
+ // Haskell has the specialty of using \<whitespace>\ within strings for multiline strings
+ const int length = line->length();
+ if (start >= length)
+ return {};
+ QVector<Token> result;
+ int tokenStart = start; // start of the token currently being accumulated
+ int current = tokenStart;
+ bool inString = *inStringGap; // an open string gap means we are already inside a string
+ do {
+ const QChar c = line->at(current);
+ if (*inStringGap && !c.isSpace() && c != '\\') {
+ // invalid non-whitespace in string gap
+ // add previous string as token, this is at least a whitespace
+ result.append(token(TokenType::String, line, tokenStart, current));
+ // then add wrong non-whitespace
+ tokenStart = current;
+ do { ++current; } while (current < length && !line->at(current).isSpace());
+ result.append(token(TokenType::StringError, line, tokenStart, current));
+ tokenStart = current;
+ } else if (c == '"') {
+ inString = !inString; // opening or closing quote
+ ++current;
+ } else if (inString) {
+ if (c == '\\') {
+ ++current;
+ if (*inStringGap) {
+ // ending string gap
+ *inStringGap = false;
+ } else if (current >= length || line->at(current).isSpace()) {
+ // starting string gap
+ *inStringGap = true;
+ current = std::min(current + 1, length);
+ } else { // there is at least one character after current
+ const int escapeLength = getEscape(*line, current);
+ if (escapeLength > 0) {
+ // valid escape
+ // add previous string as token without backslash, if necessary
+ if (tokenStart < current - 1/*backslash*/)
+ result.append(token(TokenType::String, line, tokenStart, current - 1));
+ tokenStart = current - 1; // backslash
+ current += escapeLength;
+ result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
+ tokenStart = current;
+ } else { // invalid escape sequence
+ // add previous string as token, this is at least backslash
+ result.append(token(TokenType::String, line, tokenStart, current));
+ result.append(token(TokenType::StringError, line, current, current + 1));
+ ++current;
+ tokenStart = current;
+ }
+ }
+ } else {
+ ++current;
+ }
+ }
+ } while (current < length && inString); // stops at the closing quote or at the end of the line
+ if (current > tokenStart)
+ result.append(token(TokenType::String, line, tokenStart, current));
+ if (inString && !*inStringGap) { // unterminated string
+ // mark last character of last token as Unknown as an error hint
+ if (!result.isEmpty()) { // should actually never be different
+ Token &lastRef = result.last();
+ if (lastRef.length == 1) {
+ lastRef.type = TokenType::StringError;
+ } else {
+ --lastRef.length; // shrink the last token by one and re-emit that character as StringError
+ lastRef.text = line->midRef(lastRef.startCol, lastRef.length);
+ result.append(token(TokenType::StringError, line, current - 1, current));
+ }
+ }
+ }
+ return result;
+}
+
+static QVector<Token> getMultiLineComment(std::shared_ptr<QString> line, int start, // Consumes multi-line comment text starting at `start`.
+ int *commentLevel/*in_out*/) // nesting depth: value on entry comes from the caller, updated to the depth where lexing stopped
+{
+ // Haskell multiline comments can be nested {- foo {- bar -} blah -}
+ const int length = line->length();
+ int current = start;
+ do {
+ const QStringRef test = line->midRef(current, 2);
+ if (test == "{-") {
+ ++(*commentLevel);
+ current += 2;
+ } else if (test == "-}" && *commentLevel > 0) {
+ --(*commentLevel);
+ current += 2;
+ } else if (*commentLevel > 0) {
+ ++current; // ordinary character inside a comment
+ }
+ } while (current < length && *commentLevel > 0); // stops at line end or when the outermost comment closed
+ if (current > start) {
+ return {token(TokenType::MultiLineComment, line, start, current)};
+ }
+ return {}; // not inside a comment and no "{-" at start
+}
+
+static QVector<Token> getChar(std::shared_ptr<QString> line, int start) // Lexes a character literal at `start`; returns {} if the character there is not a single quote.
+{
+ if (line->at(start) != '\'')
+ return {};
+ QVector<Token> result;
+ const int length = line->length();
+ int tokenStart = start;
+ int current = tokenStart + 1;
+ bool inChar = true; // false once the closing quote has been consumed
+ int count = 0; // number of characters (an escape counts once) seen between the quotes
+ while (current < length && inChar) {
+ if (line->at(current) == '\'') {
+ inChar = false;
+ ++current;
+ } else if (count == 1) {
+ // we already have one character, so start Unknown token
+ if (current > tokenStart)
+ result.append(token(TokenType::Char, line, tokenStart, current));
+ tokenStart = current;
+ ++count;
+ ++current;
+ } else if (count > 1) {
+ ++count;
+ ++current;
+ } else if (line->at(current) == '\\') {
+ if (current + 1 < length) {
+ ++current;
+ ++count;
+ const int escapeLength = getEscape(*line, current);
+ if (line->at(current) != '&' && escapeLength > 0) { // no & escape for chars
+ // valid escape
+ // add previous string as token without backslash, if necessary
+ if (tokenStart < current - 1/*backslash*/)
+ result.append(token(TokenType::Char, line, tokenStart, current - 1));
+ tokenStart = current - 1; // backslash
+ current += escapeLength;
+ result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
+ tokenStart = current;
+ } else { // invalid escape sequence
+ // add previous string as token, this is at least backslash
+ result.append(token(TokenType::Char, line, tokenStart, current));
+ result.append(token(TokenType::CharError, line, current, current + 1));
+ ++current;
+ tokenStart = current;
+ }
+ } else {
+ ++current; // backslash is the last character of the line
+ }
+ } else {
+ ++count;
+ ++current;
+ }
+ }
+ if (count > 1 && inChar) {
+ // too long and unterminated, just add Unknown token till end
+ result.append(token(TokenType::CharError, line, tokenStart, current));
+ } else if (count > 1) {
+ // too long but terminated, add Unknown up to ending quote, then quote
+ result.append(token(TokenType::CharError, line, tokenStart, current - 1));
+ result.append(token(TokenType::Char, line, current - 1, current));
+ } else if (inChar || count < 1) {
+ // unterminated, or no character inside, mark last character as error
+ if (current > tokenStart + 1)
+ result.append(token(TokenType::Char, line, tokenStart, current - 1));
+ result.append(token(TokenType::CharError, line, current - 1, current));
+ } else {
+ result.append(token(TokenType::Char, line, tokenStart, current)); // well-formed literal (remainder after an escape, if any)
+ }
+ return result;
+}
+
+// Lexes one of the SPECIAL characters at column `start`.
+// Returns a single one-character Special token, or an empty vector if the character at
+// `start` is not in SPECIAL. The caller must pass start < line->length().
+static QVector<Token> getSpecial(std::shared_ptr<QString> line, int start)
+{
+    if (!SPECIAL->contains(line->at(start)))
+        return {};
+    return {{TokenType::Special, start, 1, line->midRef(start, 1), line}};
+}
+
+// Splits one line of Haskell source into tokens.
+//
+// `startState` is the Tokens::state produced for the previous line (Tokens::State::None, i.e.
+// -1, if there is none). It tells whether this line starts inside a string gap
+// (State::StringGap) or inside multi-line comments (State::MultiLineCommentGuard plus the
+// nesting depth). The returned Tokens::state carries the same information on to the next
+// line; it stays State::None if the line ends in neither.
+//
+// The returned tokens cover the whole line without gaps: text that no lexer recognizes is
+// emitted as one-character Unknown tokens.
+Tokens HaskellTokenizer::tokenize(const QString &line, int startState)
+{
+    Tokens result(std::make_shared<QString>(line));
+    const int length = result.source->length();
+    bool inStringGap = startState == int(Tokens::State::StringGap);
+    // states above MultiLineCommentGuard encode the comment nesting depth
+    int multiLineCommentLevel = std::max(startState - int(Tokens::State::MultiLineCommentGuard), 0);
+    int currentStart = 0;
+    while (currentStart < length) {
+        // Try the lexers in priority order; the first one that yields tokens wins.
+        // Strings are not looked for while inside a multi-line comment.
+        QVector<Token> tokens;
+        if (multiLineCommentLevel <= 0)
+            tokens = getString(result.source, currentStart, &inStringGap);
+        if (tokens.isEmpty())
+            tokens = getMultiLineComment(result.source, currentStart, &multiLineCommentLevel);
+        if (tokens.isEmpty())
+            tokens = getChar(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getSpace(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getNumber(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getIdOrOpOrSingleLineComment(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getSpecial(result.source, currentStart);
+        if (tokens.isEmpty()) {
+            // nothing matched: consume a single character so that the loop always advances
+            tokens = {{TokenType::Unknown,
+                       currentStart,
+                       1,
+                       result.source->midRef(currentStart, 1),
+                       result.source}};
+        }
+        result.append(tokens);
+        // continue right after the last token that was produced
+        currentStart += std::accumulate(tokens.cbegin(), tokens.cend(), 0,
+                                        [](int s, const Token &t) { return s + t.length; });
+    }
+    if (inStringGap)
+        result.state = int(Tokens::State::StringGap);
+    else if (multiLineCommentLevel > 0)
+        result.state = int(Tokens::State::MultiLineCommentGuard) + multiLineCommentLevel;
+    return result;
+}
+
+bool Token::isValid() const // A token is valid unless its type is Unknown, which is also the default type of a default-constructed Token.
+{
+ return type != TokenType::Unknown;
+}
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskelltokenizer.h b/plugins/haskell/haskelltokenizer.h
new file mode 100644
index 0000000..46b4b00
--- /dev/null
+++ b/plugins/haskell/haskelltokenizer.h
@@ -0,0 +1,91 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#pragma once
+
+#include <QChar>
+#include <QString>
+#include <QVector>
+
+#include <memory>
+
+namespace Haskell {
+namespace Internal {
+
+enum class TokenType { // Kinds of tokens produced by HaskellTokenizer.
+ Variable, // identifier starting with a lowercase letter or '_', possibly module-qualified
+ Constructor, // identifier starting with an uppercase letter, possibly module-qualified
+ Operator,
+ OperatorConstructor, // operator whose symbol part starts with ':'
+ Whitespace,
+ String,
+ StringError, // erroneous part of a string literal
+ Char,
+ CharError, // erroneous part of a character literal
+ EscapeSequence, // backslash escape inside a string or character literal
+ Integer,
+ Float,
+ Keyword, // reserved identifier or reserved operator
+ Special, // one of the single special characters such as '(' or ','
+ SingleLineComment,
+ MultiLineComment,
+ Unknown // text no lexer recognized; also the default type of a Token
+};
+
+class Token { // One lexical token of a single line of source text.
+public:
+ bool isValid() const; // false if type is Unknown
+
+ TokenType type = TokenType::Unknown;
+ int startCol = -1; // index of the token's first character within the line
+ int length = -1; // number of characters in the token
+ QStringRef text; // view of the token's characters inside *source
+
+ std::shared_ptr<QString> source; // keep the string ref alive
+};
+
+class Tokens : public QVector<Token> // Tokens of one line, plus the lexer state to hand to the tokenizer for the next line.
+{
+public:
+ enum class State {
+ None = -1, // line ends outside of any string gap or multi-line comment
+ StringGap = 0, // gap == two backslashes enclosing only whitespace
+ MultiLineCommentGuard // nothing may follow that
+ };
+
+ Tokens(std::shared_ptr<QString> source);
+
+ std::shared_ptr<QString> source; // the line text the tokens refer to
+ int state = int(State::None); // a State value, or MultiLineCommentGuard + comment nesting depth
+};
+
+class HaskellTokenizer // Line-based lexer for Haskell source text.
+{
+public:
+ static Tokens tokenize(const QString &line, int startState); // startState: Tokens::state of the previous line, or Tokens::State::None
+};
+
+} // Internal
+} // Haskell
diff --git a/tests/auto/tokenizer/tokenizer.pro b/tests/auto/tokenizer/tokenizer.pro
new file mode 100644
index 0000000..a9ec439
--- /dev/null
+++ b/tests/auto/tokenizer/tokenizer.pro
@@ -0,0 +1,11 @@
+include(../../../plugins/haskell/config.pri)
+
+include($$IDE_SOURCE_TREE/tests/auto/qttest.pri)
+
+SOURCES += tst_tokenizer.cpp \
+ $$PWD/../../../plugins/haskell/haskelltokenizer.cpp
+
+HEADERS += \
+ $$PWD/../../../plugins/haskell/haskelltokenizer.h
+
+INCLUDEPATH += $$PWD/../../../plugins/haskell
diff --git a/tests/auto/tokenizer/tst_tokenizer.cpp b/tests/auto/tokenizer/tst_tokenizer.cpp
new file mode 100644
index 0000000..ffa34b2
--- /dev/null
+++ b/tests/auto/tokenizer/tst_tokenizer.cpp
@@ -0,0 +1,730 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include <haskelltokenizer.h>
+
+#include <QObject>
+#include <QtTest>
+
+using namespace Haskell::Internal;
+
+const QSet<char> escapes{'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'}; // characters the tests expect to form a valid one-character escape after a backslash ('&' only in strings)
+
+struct TokenInfo // expected token, comparable with a Token through the operator== overloads in this file
+{
+ TokenType type;
+ int column; // compared against Token::startCol
+ QString text; // compared against Token::text and, through its length, Token::length
+};
+
+Q_DECLARE_METATYPE(TokenInfo) // lets QList<TokenInfo> be used as a QTest data column type
+
+// Returns true when the expectation matches the token in type, start column,
+// length and text. The checks run in that order and stop at the first mismatch.
+bool operator==(const TokenInfo &info, const Token &token)
+{
+    if (info.type != token.type || info.column != token.startCol)
+        return false;
+    if (info.text.length() != token.length)
+        return false;
+    return info.text == token.text.toString();
+}
+
+// Mirror overload so that the comparison can be written with the Token on the left.
+bool operator==(const Token &token, const TokenInfo &info)
+{
+    return operator==(info, token);
+}
+
+class tst_Tokenizer : public QObject // Qt Test fixture exercising HaskellTokenizer::tokenize()
+{
+ Q_OBJECT
+
+private slots: // every X_data() slot fills the table that slot X() checks through checkData()
+ void singleLineComment_data();
+ void singleLineComment();
+
+ void multiLineComment_data();
+ void multiLineComment();
+
+ void string_data();
+ void string();
+
+ void character_data();
+ void character();
+
+ void number_data();
+ void number();
+
+ void keyword_data();
+ void keyword();
+
+ void variable_data();
+ void variable();
+
+ void constructor_data();
+ void constructor();
+
+ void op_data();
+ void op();
+
+private:
+ void setupData(); // declares the columns shared by all data tables
+ void addRow(const char *name, // appends one row; both states default to State::None
+ const QString &input,
+ const QList<TokenInfo> &tokens,
+ Tokens::State startState = Tokens::State::None,
+ Tokens::State endState = Tokens::State::None);
+ void checkData(); // tokenizes the current row's input and verifies tokens and end state
+};
+
+void tst_Tokenizer::setupData() // declares the four columns; addRow() streams its values in this same order
+{
+ QTest::addColumn<QString>("input");
+ QTest::addColumn<QList<TokenInfo>>("output");
+ QTest::addColumn<int>("startState"); // states travel as int, matching Tokens::state
+ QTest::addColumn<int>("endState");
+}
+
+// Appends one row to the current data table. The values are streamed in the
+// column order declared by setupData(); the states are stored as plain ints.
+void tst_Tokenizer::addRow(const char *name,
+                           const QString &input,
+                           const QList<TokenInfo> &tokens,
+                           Tokens::State startState,
+                           Tokens::State endState)
+{
+    auto &row = QTest::newRow(name);
+    row << input << tokens;
+    row << static_cast<int>(startState) << static_cast<int>(endState);
+}
+
+// Tokenizes the current row's input and compares the result with the row's
+// expectations: first the token count, then the end state, then every token
+// in order. The first mismatch fails the row.
+void tst_Tokenizer::checkData()
+{
+    QFETCH(QString, input);
+    QFETCH(QList<TokenInfo>, output);
+    QFETCH(int, startState);
+    QFETCH(int, endState);
+    const Tokens tokens = HaskellTokenizer::tokenize(input, startState);
+    QCOMPARE(tokens.length(), output.length());
+    QCOMPARE(tokens.state, endState);
+    int index = 0;
+    for (const Token &t : tokens) {
+        const TokenInfo &ti = output.at(index);
+        // Token types are printed as their numeric enum values.
+        const QString mismatch
+            = QString("Token at index %1 does not match, {%2, %3, \"%4\"} != {%5, %6, \"%7\"}")
+                  .arg(index)
+                  .arg(int(t.type)).arg(t.startCol).arg(t.text.toString())
+                  .arg(int(ti.type)).arg(ti.column).arg(ti.text);
+        QVERIFY2(t == ti, mismatch.toUtf8().constData());
+        ++index;
+    }
+}
+
+void tst_Tokenizer::singleLineComment_data() // rows for "--" line comments and inputs that only look like them
+{
+ setupData();
+
+ addRow("simple", " -- foo", {
+ {TokenType::Whitespace, 0, " "},
+ {TokenType::SingleLineComment, 1, "-- foo"}
+ });
+ addRow("dash, id", "--foo", {
+ {TokenType::SingleLineComment, 0, "--foo"}
+ });
+ addRow("dash, space, op", "-- |foo", {
+ {TokenType::SingleLineComment, 0, "-- |foo"}
+ });
+ addRow("multi-dash, space", "---- foo", {
+ {TokenType::SingleLineComment, 0, "---- foo"}
+ });
+ addRow("dash, op", "--| foo", { // dashes directly followed by '|' are expected to lex as one operator, not a comment
+ {TokenType::Operator, 0, "--|"},
+ {TokenType::Whitespace, 3, " "},
+ {TokenType::Variable, 4, "foo"}
+ });
+ addRow("dash, special", "--(foo", {
+ {TokenType::SingleLineComment, 0, "--(foo"}
+ });
+ addRow("not a qualified varsym", "F.-- foo", { // the dashes are not absorbed into a qualified operator "F.--"
+ {TokenType::Constructor, 0, "F"},
+ {TokenType::Operator, 1, "."},
+ {TokenType::SingleLineComment, 2, "-- foo"}
+ });
+}
+
+void tst_Tokenizer::singleLineComment() // checks the current row supplied by singleLineComment_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::multiLineComment_data() // rows for {- -} comments, including nesting carried across lines through the state
+{
+ setupData();
+
+ addRow("trailing dashes", "{---foo -}", {
+ {TokenType::MultiLineComment, 0, "{---foo -}"}
+ });
+ addRow("multiline", "{- foo", { // unterminated: the end state is guard + 1, one comment still open
+ {TokenType::MultiLineComment, 0, "{- foo"}
+ },
+ Tokens::State::None,
+ Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1));
+ addRow("multiline2", "bar -}", { // starts inside one open comment and closes it
+ {TokenType::MultiLineComment, 0, "bar -}"}
+ },
+ Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1),
+ Tokens::State::None);
+ addRow("nested", "{- fo{-o", { // two comments opened, none closed: guard + 2
+ {TokenType::MultiLineComment, 0, "{- fo{-o"}
+ },
+ Tokens::State::None,
+ Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2));
+ addRow("nested2", "bar -}", { // closing one level leaves one comment open
+ {TokenType::MultiLineComment, 0, "bar -}"}
+ },
+ Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2),
+ Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1));
+ addRow("nested3", "bar -}", {
+ {TokenType::MultiLineComment, 0, "bar -}"}
+ },
+ Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1),
+ Tokens::State::None);
+}
+
+void tst_Tokenizer::multiLineComment() // checks the current row supplied by multiLineComment_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::string_data() // rows for string literals: termination, gaps and every escape form
+{
+ setupData();
+
+ addRow("simple", "\"foo\"", {
+ {TokenType::String, 0, "\"foo\""}
+ });
+
+ addRow("unterminated", "\"", {
+ {TokenType::StringError, 0, "\""}
+ });
+ addRow("unterminated2", "\"foo", { // only the last character is expected to be flagged as the error
+ {TokenType::String, 0, "\"fo"},
+ {TokenType::StringError, 3, "o"}
+ });
+ addRow("unterminated with escape", "\"\\\\", {
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, "\\"},
+ {TokenType::StringError, 2, "\\"}
+ });
+
+ // gaps
+ addRow("gap", "\" \\ \\\"", {
+ {TokenType::String, 0, "\" \\ \\\""}
+ });
+ addRow("gap over endline", "\"foo\\", { // a trailing backslash leaves the line in the StringGap state
+ {TokenType::String, 0, "\"foo\\"}
+ },
+ Tokens::State::None, Tokens::State::StringGap);
+ addRow("gap over endline2", "\\foo\"", { // resumes inside the gap through the StringGap start state
+ {TokenType::String, 0, "\\foo\""}
+ },
+ Tokens::State::StringGap, Tokens::State::None);
+ addRow("gap error", "\"\\ ab \\\"", {
+ {TokenType::String, 0, "\"\\ "},
+ {TokenType::StringError, 3, "ab"},
+ {TokenType::String, 5, " \\\""}
+ });
+ addRow("gap error with quote", "\"\\ \"", {
+ {TokenType::String, 0, "\"\\ "},
+ {TokenType::StringError, 3, "\""}
+ },
+ Tokens::State::None, Tokens::State::StringGap);
+
+ // char escapes (including wrong ones)
+ for (char c = '!'; c <= '~'; ++c) {
+ // skip uppercase and '^', since these can be part of ascii escapes
+ // and 'o' and 'x' since they start octal and hex escapes
+ // and digits as part of decimal escape
+ if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x')
+ continue;
+ const QChar qc(c);
+ const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8();
+ const QString input = QString("\"\\%1\"").arg(qc);
+ if (escapes.contains(c)) {
+ addRow(name.constData(), input, {
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc},
+ {TokenType::String, 3, "\""}
+ });
+ } else { // unknown escape: the backslash stays String, the character after it is the error
+ addRow(name.constData(), input, {
+ {TokenType::String, 0, "\"\\"},
+ {TokenType::StringError, 2, qc},
+ {TokenType::String, 3, "\""}
+ });
+ }
+ }
+
+ addRow("decimal escape", "\"\\1234a\"", {
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, "\\1234"},
+ {TokenType::String, 6, "a\""}
+ });
+
+ addRow("octal escape", "\"\\o0678a\"", { // the escape is expected to end before '8', which is not an octal digit
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, "\\o067"},
+ {TokenType::String, 6, "8a\""}
+ });
+ addRow("octal escape error", "\"\\o8a\"", {
+ {TokenType::String, 0, "\"\\"},
+ {TokenType::StringError, 2, "o"},
+ {TokenType::String, 3, "8a\""}
+ });
+
+ addRow("hexadecimal escape", "\"\\x0678Abg\"", {
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, "\\x0678Ab"},
+ {TokenType::String, 9, "g\""}
+ });
+ addRow("hexadecimal escape error", "\"\\xg\"", {
+ {TokenType::String, 0, "\"\\"},
+ {TokenType::StringError, 2, "x"},
+ {TokenType::String, 3, "g\""}
+ });
+
+ // ascii cntrl escapes (including wrong ones)
+ for (char c = '!'; c <= '~'; ++c) {
+ if (c == '"') // is special because it also ends the string
+ continue;
+ const QChar qc(c);
+ const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8();
+ const QString input = QString("\"\\^%1\"").arg(qc);
+ if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']'
+ || qc == '^' || qc == '_') {
+ addRow(name.constData(), input, {
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc},
+ {TokenType::String, 4, "\""}
+ });
+ } else { // invalid control character: only the '^' is flagged, the rest is ordinary string content
+ addRow(name.constData(), input, {
+ {TokenType::String, 0, "\"\\"},
+ {TokenType::StringError, 2, "^"},
+ {TokenType::String, 3, QString(qc) + "\""}
+ });
+ }
+ }
+
+ addRow("ascii escape SOH", "\"\\SOHN\"", { // the longer name SOH is expected to win over SO
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, "\\SOH"},
+ {TokenType::String, 5, "N\""}
+ });
+ addRow("ascii escape SO", "\"\\SON\"", {
+ {TokenType::String, 0, "\""},
+ {TokenType::EscapeSequence, 1, "\\SO"},
+ {TokenType::String, 4, "N\""}
+ });
+ addRow("ascii escape error", "\"\\TON\"", {
+ {TokenType::String, 0, "\"\\"},
+ {TokenType::StringError, 2, "T"},
+ {TokenType::String, 3, "ON\""}
+ });
+ addRow("ascii escape error 2", "\"\\STO\"", {
+ {TokenType::String, 0, "\"\\"},
+ {TokenType::StringError, 2, "S"},
+ {TokenType::String, 3, "TO\""}
+ });
+}
+
+void tst_Tokenizer::string() // checks the current row supplied by string_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::character_data() // rows for character literals: length errors, termination and every escape form
+{
+ setupData();
+
+ addRow("simple", "'a'", {
+ {TokenType::Char, 0, "'a'"}
+ });
+ addRow("too many", "'abc'", { // everything after the first character is expected to be flagged
+ {TokenType::Char, 0, "'a"},
+ {TokenType::CharError, 2, "bc"},
+ {TokenType::Char, 4, "'"}
+ });
+ addRow("too few", "''", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::CharError, 1, "'"}
+ });
+ addRow("only quote", "'", {
+ {TokenType::CharError, 0, "'"}
+ });
+ addRow("unterminated", "'a", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::CharError, 1, "a"}
+ });
+ addRow("unterminated too many", "'abc", {
+ {TokenType::Char, 0, "'a"},
+ {TokenType::CharError, 2, "bc"}
+ });
+ addRow("unterminated backslash", "'\\", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::CharError, 1, "\\"}
+ });
+
+ // char escapes (including wrong ones)
+ for (char c = '!'; c <= '~'; ++c) {
+ // skip uppercase and '^', since these can be part of ascii escapes
+ // and 'o' and 'x' since they start octal and hex escapes
+ // and digits as part of decimal escape
+ if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x')
+ continue;
+ const QChar qc(c);
+ const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8();
+ const QString input = QString("'\\%1'").arg(qc);
+ if (c != '&' && escapes.contains(c)) { // unlike in strings, "\&" is expected to be an error here
+ addRow(name.constData(), input, {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc},
+ {TokenType::Char, 3, "'"}
+ });
+ } else {
+ addRow(name.constData(), input, {
+ {TokenType::Char, 0, "'\\"},
+ {TokenType::CharError, 2, qc},
+ {TokenType::Char, 3, "'"}
+ });
+ }
+ }
+
+ addRow("decimal escape", "'\\1234'", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, "\\1234"},
+ {TokenType::Char, 6, "'"}
+ });
+ addRow("decimal escape too long", "'\\1234a'", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, "\\1234"},
+ {TokenType::CharError, 6, "a"},
+ {TokenType::Char, 7, "'"}
+ });
+
+ addRow("octal escape", "'\\o067'", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, "\\o067"},
+ {TokenType::Char, 6, "'"}
+ });
+ addRow("octal escape error", "'\\o8'", { // two separate error tokens are expected: the bad escape letter, then the surplus character
+ {TokenType::Char, 0, "'\\"},
+ {TokenType::CharError, 2, "o"},
+ {TokenType::CharError, 3, "8"},
+ {TokenType::Char, 4, "'"}
+ });
+
+ addRow("hexadecimal escape", "'\\x0678Ab'", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, "\\x0678Ab"},
+ {TokenType::Char, 9, "'"}
+ });
+ addRow("hexadecimal escape error", "'\\xg'", {
+ {TokenType::Char, 0, "'\\"},
+ {TokenType::CharError, 2, "x"},
+ {TokenType::CharError, 3, "g"},
+ {TokenType::Char, 4, "'"}
+ });
+
+ // ascii cntrl escapes (including wrong ones)
+ for (char c = '!'; c <= '~'; ++c) {
+ if (c == '\'') // is special because it also ends the character literal
+ continue;
+ const QChar qc(c);
+ const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8();
+ const QString input = QString("'\\^%1'").arg(qc);
+ if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']'
+ || qc == '^' || qc == '_') {
+ addRow(name.constData(), input, {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc},
+ {TokenType::Char, 4, "'"}
+ });
+ } else {
+ addRow(name.constData(), input, {
+ {TokenType::Char, 0, "'\\"},
+ {TokenType::CharError, 2, "^"},
+ {TokenType::CharError, 3, qc},
+ {TokenType::Char, 4, "'"}
+ });
+ }
+ }
+
+ addRow("ascii escape SOH", "'\\SOH'", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, "\\SOH"},
+ {TokenType::Char, 5, "'"}
+ });
+ addRow("ascii escape SO, too long", "'\\SON'", {
+ {TokenType::Char, 0, "'"},
+ {TokenType::EscapeSequence, 1, "\\SO"},
+ {TokenType::CharError, 4, "N"},
+ {TokenType::Char, 5, "'"}
+ });
+ addRow("ascii escape error", "'\\TON'", {
+ {TokenType::Char, 0, "'\\"},
+ {TokenType::CharError, 2, "T"},
+ {TokenType::CharError, 3, "ON"},
+ {TokenType::Char, 5, "'"}
+ });
+}
+
+void tst_Tokenizer::character() // checks the current row supplied by character_data()
+{
+ checkData();
+}
+
+// Rows for numeric literals: decimal, octal and hexadecimal integers, floats
+// with and without fraction or exponent, and inputs where the literal ends
+// early so that the rest is lexed as separate tokens.
+// Every row needs its own data tag so a failing row can be identified.
+void tst_Tokenizer::number_data()
+{
+    setupData();
+
+    addRow("decimal", "012345", {
+        {TokenType::Integer, 0, "012345"}
+    });
+    addRow("single digit decimal", "0", {
+        {TokenType::Integer, 0, "0"}
+    });
+    addRow("octal", "0o1234", {
+        {TokenType::Integer, 0, "0o1234"}
+    });
+    // this is a bit weird, but correct: octal 1 followed by decimal 8
+    addRow("number after octal", "0O18", {
+        {TokenType::Integer, 0, "0O1"},
+        {TokenType::Integer, 3, "8"}
+    });
+    addRow("not octal", "0o9", {
+        {TokenType::Integer, 0, "0"},
+        {TokenType::Variable, 1, "o9"},
+    });
+    addRow("hexadecimal", "0x9fA", {
+        {TokenType::Integer, 0, "0x9fA"}
+    });
+    // hex number followed by identifier 'g'
+    addRow("hexadecimal followed by identifier", "0X9fg", {
+        {TokenType::Integer, 0, "0X9f"},
+        {TokenType::Variable, 4, "g"}
+    });
+
+    // 0 followed by identifier
+    addRow("decimal followed by identifier", "0z6", {
+        {TokenType::Integer, 0, "0"},
+        {TokenType::Variable, 1, "z6"}
+    });
+
+    addRow("float", "0123.45", {
+        {TokenType::Float, 0, "0123.45"}
+    });
+    addRow("decimal + operator '.'", "0123.", {
+        {TokenType::Integer, 0, "0123"},
+        {TokenType::Operator, 4, "."}
+    });
+    addRow("operator '.' + decimal", ".0123", {
+        {TokenType::Operator, 0, "."},
+        {TokenType::Integer, 1, "0123"}
+    });
+    addRow("without '.', with exp 'e'", "0123e45", {
+        {TokenType::Float, 0, "0123e45"}
+    });
+    addRow("without '.', with exp 'E'", "0123E45", {
+        {TokenType::Float, 0, "0123E45"}
+    });
+    addRow("without '.', with '+'", "0123e+45", {
+        {TokenType::Float, 0, "0123e+45"}
+    });
+    addRow("without '.', with '-'", "0123e-45", {
+        {TokenType::Float, 0, "0123e-45"}
+    });
+    // an exponent marker without digits does not belong to the number
+    addRow("without '.', with '+', missing decimal", "0123e+", {
+        {TokenType::Integer, 0, "0123"},
+        {TokenType::Variable, 4, "e"},
+        {TokenType::Operator, 5, "+"}
+    });
+    addRow("without '.', missing decimal", "0123e", {
+        {TokenType::Integer, 0, "0123"},
+        {TokenType::Variable, 4, "e"}
+    });
+    addRow("exp 'e'", "01.23e45", {
+        {TokenType::Float, 0, "01.23e45"}
+    });
+    addRow("exp 'E'", "01.23E45", {
+        {TokenType::Float, 0, "01.23E45"}
+    });
+    addRow("with '+'", "01.23e+45", {
+        {TokenType::Float, 0, "01.23e+45"}
+    });
+    addRow("with '-'", "01.23e-45", {
+        {TokenType::Float, 0, "01.23e-45"}
+    });
+    addRow("with '+', missing decimal", "01.23e+", {
+        {TokenType::Float, 0, "01.23"},
+        {TokenType::Variable, 5, "e"},
+        {TokenType::Operator, 6, "+"}
+    });
+    addRow("missing decimal", "01.23e", {
+        {TokenType::Float, 0, "01.23"},
+        {TokenType::Variable, 5, "e"}
+    });
+}
+
+void tst_Tokenizer::number() // checks the current row supplied by number_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::keyword_data() // rows for reserved words and reserved operators
+{
+ setupData();
+
+ addRow("data", "data", {
+ {TokenType::Keyword, 0, "data"}
+ });
+ addRow("not a qualified varid", "Foo.case", { // three tokens expected: the keyword is not absorbed into a qualified name
+ {TokenType::Constructor, 0, "Foo"},
+ {TokenType::Operator, 3, "."},
+ {TokenType::Keyword, 4, "case"}
+ });
+ addRow(":", ":", {
+ {TokenType::Keyword, 0, ":"}
+ });
+ addRow("->", "->", {
+ {TokenType::Keyword, 0, "->"}
+ });
+ addRow("not a qualified varsym", "Foo...", { // "..." stays a separate operator after the constructor
+ {TokenType::Constructor, 0, "Foo"},
+ {TokenType::Operator, 3, "..."}
+ });
+}
+
+void tst_Tokenizer::keyword() // checks the current row supplied by keyword_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::variable_data() // rows for plain and qualified variable identifiers
+{
+ setupData();
+
+ addRow("simple", "fOo_1'", {
+ {TokenType::Variable, 0, "fOo_1'"}
+ });
+ addRow("start with '_'", "_1", {
+ {TokenType::Variable, 0, "_1"}
+ });
+ addRow("not a keyword", "cases", { // a keyword prefix alone does not make a keyword
+ {TokenType::Variable, 0, "cases"}
+ });
+ addRow("not a keyword 2", "qualified", { // "qualified", "as" and "hiding" are expected to lex as ordinary variables
+ {TokenType::Variable, 0, "qualified"}
+ });
+ addRow("not a keyword 3", "as", {
+ {TokenType::Variable, 0, "as"}
+ });
+ addRow("not a keyword 4", "hiding", {
+ {TokenType::Variable, 0, "hiding"}
+ });
+ addRow(".variable", ".foo", {
+ {TokenType::Operator, 0, "."},
+ {TokenType::Variable, 1, "foo"}
+ });
+ addRow("variable.", "foo.", {
+ {TokenType::Variable, 0, "foo"},
+ {TokenType::Operator, 3, "."}
+ });
+ addRow("variable.variable", "blah.foo", { // a lowercase prefix does not qualify: three tokens expected
+ {TokenType::Variable, 0, "blah"},
+ {TokenType::Operator, 4, "."},
+ {TokenType::Variable, 5, "foo"}
+ });
+ addRow("qualified", "Blah.foo", { // a qualified name is expected as one Variable token
+ {TokenType::Variable, 0, "Blah.foo"}
+ });
+ addRow("qualified2", "Goo.Blah.foo", {
+ {TokenType::Variable, 0, "Goo.Blah.foo"}
+ });
+ addRow("variable + op '..'", "foo..", {
+ {TokenType::Variable, 0, "foo"},
+ {TokenType::Keyword, 3, ".."}
+ });
+ addRow("variable + op '...'", "foo...", {
+ {TokenType::Variable, 0, "foo"},
+ {TokenType::Operator, 3, "..."}
+ });
+}
+
+void tst_Tokenizer::variable() // checks the current row supplied by variable_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::constructor_data() // rows for plain and qualified constructor identifiers
+{
+ setupData();
+
+ addRow("simple", "Foo", {
+ {TokenType::Constructor, 0, "Foo"}
+ });
+ addRow("qualified", "Foo.Bar", { // a qualified constructor is expected as one token
+ {TokenType::Constructor, 0, "Foo.Bar"}
+ });
+ addRow("followed by op '.'", "Foo.Bar.", { // the trailing '.' is a separate operator
+ {TokenType::Constructor, 0, "Foo.Bar"},
+ {TokenType::Operator, 7, "."}
+ });
+
+}
+
+void tst_Tokenizer::constructor() // checks the current row supplied by constructor_data()
+{
+ checkData();
+}
+
+void tst_Tokenizer::op_data() // rows for plain and qualified operator symbols
+{
+ setupData();
+
+ addRow("simple", "+-=", {
+ {TokenType::Operator, 0, "+-="}
+ });
+ addRow("qualified", "Foo.+-=", { // qualifier and symbol are expected as one Operator token
+ {TokenType::Operator, 0, "Foo.+-="}
+ });
+ addRow("qualified '.'", "Foo..", { // here the second '.' is the operator being qualified
+ {TokenType::Operator, 0, "Foo.."}
+ });
+ addRow("constructor plus op", "Foo+", { // without a '.' the operator stays a separate token
+ {TokenType::Constructor, 0, "Foo"},
+ {TokenType::Operator, 3, "+"}
+ });
+}
+
+void tst_Tokenizer::op() // checks the current row supplied by op_data()
+{
+ checkData();
+}
+
+QTEST_MAIN(tst_Tokenizer) // generates the main() that runs the tst_Tokenizer test slots
+
+#include "tst_tokenizer.moc"