Add highlighter

See lexical structure of Haskell https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
author: Eike Ziller <git@eikeziller.de> 2017-04-29 16:17:11 +0200
committer: Eike Ziller <git@eikeziller.de> 2017-10-01 20:11:08 +0200
commit: 5798e33d742c0f413d2d865fdb75739b4374ce98 (patch)
tree: e7d36edf5de22ab74ed4b56e2e2b22be24f50ef6
parent: 2f69373309cfe88084c5777baeff6bb46eecd071 (diff)
8 files changed, 1681 insertions, 2 deletions
diff --git a/plugins/haskell/haskell.pro b/plugins/haskell/haskell.pro
index 9c557b1..aa92d3c 100644
--- a/plugins/haskell/haskell.pro
+++ b/plugins/haskell/haskell.pro
@@ -5,14 +5,18 @@ DEFINES += HASKELL_LIBRARY
 SOURCES += \
     haskellcompletionassist.cpp \
     haskelleditorfactory.cpp \
-    haskellplugin.cpp
+    haskellplugin.cpp \
+    haskellhighlighter.cpp \
+    haskelltokenizer.cpp
 
 HEADERS += \
     haskell_global.h \
     haskellcompletionassist.h \
     haskellconstants.h \
     haskelleditorfactory.h \
-    haskellplugin.h
+    haskellplugin.h \
+    haskellhighlighter.h \
+    haskelltokenizer.h
 
 ## uncomment to build plugin into user config directory
 ## <localappdata>/plugins/<ideversion>
diff --git a/plugins/haskell/haskelleditorfactory.cpp b/plugins/haskell/haskelleditorfactory.cpp
index 220e52e..8119105 100644
--- a/plugins/haskell/haskelleditorfactory.cpp
+++ b/plugins/haskell/haskelleditorfactory.cpp
@@ -27,6 +27,7 @@
 
 #include "haskellcompletionassist.h"
 #include "haskellconstants.h"
+#include "haskellhighlighter.h"
 
 #include <texteditor/textdocument.h>
 #include <texteditor/texteditoractionhandler.h>
@@ -47,6 +48,7 @@ HaskellEditorFactory::HaskellEditorFactory()
     setParenthesesMatchingEnabled(true);
     setMarksVisible(true);
     setCompletionAssistProvider(new HaskellCompletionAssistProvider);
+    setSyntaxHighlighterCreator([] { return new HaskellHighlighter(); });
 }
 
 } // Internal
diff --git a/plugins/haskell/haskellhighlighter.cpp b/plugins/haskell/haskellhighlighter.cpp
new file mode 100644
index 0000000..9899cc4
--- /dev/null
+++ b/plugins/haskell/haskellhighlighter.cpp
@@ -0,0 +1,152 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include "haskellhighlighter.h"
+
+#include "haskelltokenizer.h"
+
+#include <texteditor/fontsettings.h>
+#include <texteditor/texteditorconstants.h>
+#include <texteditor/texteditorsettings.h>
+
+#include <QDebug>
+#include <QVector>
+
+// Identifiers that highlightBlock() formats as keywords when they are
+// tokenized as variables on a line after the "import" keyword.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, IMPORT_HIGHLIGHTS,
+                          ({"qualified", "as", "hiding"}));
+
+using namespace TextEditor;
+
+namespace Haskell {
+namespace Internal {
+
+// Sets up the default text format categories and derives the highlighter's
+// own formats from the current global font settings.
+HaskellHighlighter::HaskellHighlighter()
+{
+    setDefaultTextFormatCategories();
+    const FontSettings &currentSettings = TextEditorSettings::fontSettings();
+    updateFormats(currentSettings);
+}
+
+// Tokenizes one line, continuing from the state the previous block ended in,
+// stores the resulting state for the next block and formats every token.
+//
+// Two pieces of per-line context influence the formatting:
+// - when the keyword "::" is the second non-whitespace token of the line, the
+//   first non-whitespace token gets the toplevel declaration format and all
+//   variables that follow on the line are formatted as C_LOCAL
+// - after the keyword "import", variables listed in IMPORT_HIGHLIGHTS are
+//   formatted as keywords
+void HaskellHighlighter::highlightBlock(const QString &text)
+{
+    const Tokens tokens = HaskellTokenizer::tokenize(text, previousBlockState());
+    setCurrentBlockState(tokens.state);
+
+    const Token *firstToken = nullptr;  // first non-whitespace token of the line
+    const Token *secondToken = nullptr; // second non-whitespace token of the line
+    bool afterTypeSignature = false;
+    bool afterImport = false;
+
+    for (const Token &token : tokens) {
+        switch (token.type) {
+        case TokenType::Variable:
+            // any other variable keeps the format it already has
+            if (afterTypeSignature)
+                setTokenFormat(token, C_LOCAL);
+            else if (afterImport && IMPORT_HIGHLIGHTS->contains(token.text.toString()))
+                setTokenFormat(token, C_KEYWORD);
+            break;
+        case TokenType::Constructor:
+        case TokenType::OperatorConstructor:
+            setTokenFormat(token, C_TYPE);
+            break;
+        case TokenType::Operator:
+            setTokenFormat(token, C_OPERATOR);
+            break;
+        case TokenType::Whitespace:
+            setTokenFormat(token, C_VISUAL_WHITESPACE);
+            break;
+        case TokenType::Keyword: {
+            const bool isToplevelSignature = token.text == "::" && firstToken && !secondToken;
+            if (isToplevelSignature) {
+                setFormat(firstToken->startCol, firstToken->length, m_toplevelDeclFormat);
+                afterTypeSignature = true;
+            } else if (token.text == "import") {
+                afterImport = true;
+            }
+            setTokenFormat(token, C_KEYWORD);
+            break;
+        }
+        case TokenType::Integer:
+        case TokenType::Float:
+            setTokenFormat(token, C_NUMBER);
+            break;
+        case TokenType::String:
+        case TokenType::Char:
+            setTokenFormatWithSpaces(text, token, C_STRING);
+            break;
+        case TokenType::EscapeSequence:
+            setTokenFormat(token, C_PRIMITIVE_TYPE);
+            break;
+        case TokenType::SingleLineComment:
+        case TokenType::MultiLineComment:
+            setTokenFormatWithSpaces(text, token, C_COMMENT);
+            break;
+        case TokenType::Special:
+            // intentionally left without an explicit format
+            break;
+        case TokenType::StringError:
+        case TokenType::CharError:
+        case TokenType::Unknown:
+            setTokenFormat(token, C_PARENTHESES_MISMATCH);
+            break;
+        }
+
+        // remember the first two non-whitespace tokens of the line
+        if (token.type == TokenType::Whitespace)
+            continue;
+        if (!firstToken)
+            firstToken = &token;
+        else if (!secondToken)
+            secondToken = &token;
+    }
+}
+
+// Passes the new font settings on to the base class and then recomputes the
+// formats that this highlighter derives from them itself (see updateFormats).
+void HaskellHighlighter::setFontSettings(const FontSettings &fontSettings)
+{
+    SyntaxHighlighter::setFontSettings(fontSettings);
+    updateFormats(fontSettings);
+}
+
+// Recomputes the format used for the name of a toplevel declaration: the
+// C_FUNCTION style with C_DECLARATION mixed in, resolved via fontSettings.
+void HaskellHighlighter::updateFormats(const FontSettings &fontSettings)
+{
+    const auto declarationStyle = TextStyles::mixinStyle(C_FUNCTION, C_DECLARATION);
+    m_toplevelDeclFormat = fontSettings.toTextCharFormat(declarationStyle);
+}
+
+// Applies the format registered for style to the columns covered by token.
+void HaskellHighlighter::setTokenFormat(const Token &token, TextStyle style)
+{
+    const QTextCharFormat format = formatForCategory(style);
+    setFormat(token.startCol, token.length, format);
+}
+
+// Like setTokenFormat(), but goes through setFormatWithSpaces(), which also
+// receives the text of the line that the token belongs to.
+void HaskellHighlighter::setTokenFormatWithSpaces(const QString &text, const Token &token,
+                                                  TextStyle style)
+{
+    const QTextCharFormat format = formatForCategory(style);
+    setFormatWithSpaces(text, token.startCol, token.length, format);
+}
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskellhighlighter.h b/plugins/haskell/haskellhighlighter.h
new file mode 100644
index 0000000..6213333
--- /dev/null
+++ b/plugins/haskell/haskellhighlighter.h
@@ -0,0 +1,58 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#pragma once
+
+#include <texteditor/syntaxhighlighter.h>
+
+#include <QHash>
+#include <QTextFormat>
+
+namespace Haskell {
+namespace Internal {
+
+class Token;
+
+// Syntax highlighter for Haskell source text. Formatting is done per line in
+// highlightBlock(), on top of the TextEditor::SyntaxHighlighter base class.
+class HaskellHighlighter : public TextEditor::SyntaxHighlighter
+{
+    Q_OBJECT
+
+public:
+    HaskellHighlighter();
+
+protected:
+    // Formats the single line (block) given in text.
+    void highlightBlock(const QString &text) override;
+
+private:
+    // Forwards to the base class and refreshes the formats cached below.
+    void setFontSettings(const TextEditor::FontSettings &fontSettings) override;
+    // Recomputes m_toplevelDeclFormat from fontSettings.
+    void updateFormats(const TextEditor::FontSettings &fontSettings);
+    // Apply the format of the given style to the range covered by token.
+    void setTokenFormat(const Token &token, TextEditor::TextStyle style);
+    void setTokenFormatWithSpaces(const QString &text, const Token &token,
+                                  TextEditor::TextStyle style);
+    // Format applied to the first token of a line whose second token is "::".
+    QTextCharFormat m_toplevelDeclFormat;
+};
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskelltokenizer.cpp b/plugins/haskell/haskelltokenizer.cpp
new file mode 100644
index 0000000..527e505
--- /dev/null
+++ b/plugins/haskell/haskelltokenizer.cpp
@@ -0,0 +1,631 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include "haskelltokenizer.h"
+
+#include <QSet>
+
+#include <algorithm>
+#include <functional>
+
+// Symbol sequences that getIdOrOpOrSingleLineComment() reports as
+// TokenType::Keyword instead of as an operator.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_OP, ({
+    "..", ":", "::", "=", "\\", "|", "<-", "->", "@", "~", "=>",
+
+    // Arrows GHC extension
+    "-<", "-<<", ">-", ">>-", "(|", "|)"
+}));
+
+// Identifiers that getIdOrOpOrSingleLineComment() reports as
+// TokenType::Keyword instead of as a variable.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_ID, ({
+    "case", "class", "data", "default", "deriving", "do", "else", "foreign",
+    "if", "import", "in", "infix", "infixl", "infixr", "instance", "let",
+    "module", "newtype", "of", "then", "type", "where", "_",
+
+    // from GHC extensions
+    "family", "forall", "mdo", "proc", "rec"
+}));
+
+// Characters that form a TokenType::Special token on their own (getSpecial)
+// and that never count as operator symbols (isSymbol).
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, SPECIAL, ({
+    '(', ')', ',', ';', '[', ']', '`', '{', '}'
+}));
+
+// Characters that complete an escape sequence on their own when they directly
+// follow a backslash (see getEscape). getChar() additionally rejects '&'.
+Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, CHAR_ESCAPES, ({
+    'a', 'b', 'f', 'n', 'r', 't', 'v',
+    '\\', '"', '\'', '&'
+}));
+
+// Named ASCII escapes accepted after a backslash. getEscape() returns the
+// first entry that is a prefix of the input, so the order is significant:
+// "SOH" must be listed before "SO" to be matched.
+Q_GLOBAL_STATIC_WITH_ARGS(QVector<QString>, ASCII_ESCAPES, ({
+    "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
+    "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
+    "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
+    "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
+    "SP", "DEL"
+}));
+
+namespace Haskell {
+namespace Internal {
+
+// Builds a token of the given type for the half-open column range
+// [start, end) of line. The token keeps a reference to line so that its text
+// stays valid.
+Token token(TokenType type, std::shared_ptr<QString> line, int start, int end)
+{
+    const int length = end - start;
+    Token result;
+    result.type = type;
+    result.startCol = start;
+    result.length = length;
+    result.text = line->midRef(start, length);
+    result.source = line;
+    return result;
+}
+
+// Creates an empty token list that shares ownership of the tokenized line.
+Tokens::Tokens(std::shared_ptr<QString> source)
+    : source{source}
+{
+}
+
+// Returns the number of consecutive characters of line, beginning at index
+// begin, for which test returns true. Returns 0 if begin is at or past the
+// end of the line.
+static int grab(const QString &line, int begin,
+                const std::function<bool(const QChar&)> &test)
+{
+    const int lineLength = line.length();
+    int matched = 0;
+    for (int pos = begin; pos < lineLength && test(line.at(pos)); ++pos)
+        ++matched;
+    return matched;
+}
+
+
+// True for characters that may appear inside an identifier: letters, numbers,
+// the underscore and the single quote.
+static bool isIdentifierChar(const QChar &c)
+{
+    if (c == '_' || c == '\'')
+        return true;
+    return c.isLetterOrNumber();
+}
+
+// True for characters that may start a variable identifier: the underscore or
+// a lowercase letter.
+static bool isVariableIdentifierStart(const QChar &c)
+{
+    if (c == '_')
+        return true;
+    return c.isLower();
+}
+
+// True for the ASCII characters that may be part of an operator.
+static bool isAscSymbol(const QChar &c)
+{
+    switch (c.unicode()) {
+    case '!':
+    case '#':
+    case '$':
+    case '%':
+    case '&':
+    case '*':
+    case '+':
+    case '.':
+    case '/':
+    case '<':
+    case '=':
+    case '>':
+    case '?':
+    case '@':
+    case '\\':
+    case '^':
+    case '|':
+    case '-':
+    case '~':
+    case ':':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// True for characters that may be part of an operator: the ASCII symbols from
+// isAscSymbol(), plus any other symbol or punctuation character except the
+// underscore, the quote characters and the characters in SPECIAL.
+static bool isSymbol(const QChar &c)
+{
+    if (isAscSymbol(c))
+        return true;
+    if (!c.isSymbol() && !c.isPunct())
+        return false;
+    if (c == '_' || c == '"' || c == '\'')
+        return false;
+    return !SPECIAL->contains(c);
+}
+
+// True if c is a decimal digit according to QChar::isDigit().
+// Free-function form so that it can be passed as a predicate to grab().
+static bool isDigit(const QChar &c)
+{
+    return c.isDigit();
+}
+
+// True if c is an octal digit, '0' to '7'.
+static bool isOctit(const QChar &c)
+{
+    const ushort code = c.unicode();
+    return code >= '0' && code <= '7';
+}
+
+// True if c is a hexadecimal digit: a decimal digit or a letter from
+// 'A' to 'F' in either case.
+static bool isHexit(const QChar &c)
+{
+    if (c.isDigit())
+        return true;
+    const bool upperHex = c >= 'A' && c <= 'F';
+    const bool lowerHex = c >= 'a' && c <= 'f';
+    return upperHex || lowerHex;
+}
+
+// True for characters that are accepted after "\^" in a control character
+// escape: 'A' to 'Z' and the characters @ [ \ ] ^ _
+static bool isCntrl(const QChar &c)
+{
+    if (c >= 'A' && c <= 'Z')
+        return true;
+    switch (c.unicode()) {
+    case '@':
+    case '[':
+    case '\\':
+    case ']':
+    case '^':
+    case '_':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Returns a single Whitespace token covering the run of whitespace that
+// begins at column start, or an empty list if there is no whitespace there.
+static QVector<Token> getSpace(std::shared_ptr<QString> line, int start)
+{
+    const int lineLength = line->length();
+    int end = start;
+    while (end < lineLength && line->at(end).isSpace())
+        ++end;
+    if (end == start)
+        return {};
+    return {token(TokenType::Whitespace, line, start, end)};
+}
+
+// Returns a single Integer or Float token for the numeric literal that begins
+// at column start, or an empty list if the character there is not a digit.
+//
+// Recognized forms:
+// - octal "0o..."/"0O..." and hexadecimal "0x..."/"0X..." integers, if at
+//   least one matching digit follows the prefix
+// - decimal integers
+// - floats: decimal digits followed by a fraction (".digits") and/or an
+//   exponent ("e"/"E", optional sign, digits)
+static QVector<Token> getNumber(std::shared_ptr<QString> line, int start)
+{
+    const QString &text = *line;
+    const QChar first = text.at(start);
+    if (!first.isDigit())
+        return {};
+
+    const int length = text.length();
+    // a lone digit at the very end of the line
+    if (start + 1 >= length)
+        return {token(TokenType::Integer, line, start, start + 1)};
+
+    if (first == '0') {
+        // check for octal or hexadecimal prefix
+        const int digitsStart = start + 2;
+        const QChar marker = text.at(start + 1);
+        int digitCount = 0;
+        if (marker == 'o' || marker == 'O')
+            digitCount = grab(text, digitsStart, isOctit);
+        else if (marker == 'x' || marker == 'X')
+            digitCount = grab(text, digitsStart, isHexit);
+        if (digitCount > 0)
+            return {token(TokenType::Integer, line, start, digitsStart + digitCount)};
+        // prefix without digits: fall through and treat the '0' as decimal
+    }
+
+    TokenType type = TokenType::Integer;
+    int end = start + grab(text, start, isDigit);
+
+    // optional fraction, only if at least one digit follows the '.'
+    if (end < length && text.at(end) == '.') {
+        const int fractionDigits = grab(text, end + 1, isDigit);
+        if (fractionDigits > 0) {
+            end += fractionDigits + 1;
+            type = TokenType::Float;
+        }
+    }
+
+    // optional exponent, needs room for at least 'e' and one more character
+    if (end + 1 < length && (text.at(end) == 'e' || text.at(end) == 'E')) {
+        int exponentStart = end + 1;
+        if (text.at(exponentStart) == '+' || text.at(exponentStart) == '-')
+            ++exponentStart;
+        const int exponentDigits = grab(text, exponentStart, isDigit);
+        if (exponentDigits > 0) {
+            end = exponentStart + exponentDigits;
+            type = TokenType::Float;
+        }
+    }
+    return {token(type, line, start, end)};
+}
+
+// Tokenizes an identifier, operator or single line comment beginning at
+// column start. Returns an empty list if none of them starts there.
+//
+// An optional module qualifier (uppercase identifiers separated by '.') is
+// consumed first. What follows the qualifier determines the result:
+// - nothing that continues the name with a '.': one Constructor token
+// - reserved identifier: Keyword, preceded by Constructor and Operator tokens
+//   for the qualifier and its trailing '.', if there is a qualifier
+// - other variable identifier: one Variable token including the qualifier
+// - reserved operator: Keyword, or only the Constructor if there is a
+//   qualifier (the '.' and the operator are left for the following calls)
+// - two or more dashes and nothing else: SingleLineComment up to the end of
+//   the line, preceded by Constructor and Operator tokens for a qualifier
+// - other operator: OperatorConstructor if it begins with ':', else Operator,
+//   in both cases including the qualifier
+// - anything else after a trailing '.': one Constructor token for the
+//   identifiers, leaving the '.' for the following call
+static QVector<Token> getIdOrOpOrSingleLineComment(std::shared_ptr<QString> line, int start)
+{
+    const int length = line->length();
+    if (start >= length)
+        return {};
+    int current = start;
+    // check for {conid.}conid
+    int conidEnd = start;
+    bool canOnlyBeConstructor = false;
+    while (current < length && line->at(current).isUpper()) {
+        current += grab(*line, current, isIdentifierChar);
+        conidEnd = current;
+        // it is definitely a constructor id if it is not followed by a '.'
+        canOnlyBeConstructor = current >= length || line->at(current) != '.';
+        // otherwise it might be a module id, and we skip the dot to check for qualified thing
+        if (!canOnlyBeConstructor)
+            ++current;
+    }
+    if (canOnlyBeConstructor)
+        return {token(TokenType::Constructor, line, start, conidEnd)};
+
+    // check for variable or reserved id
+    if (current < length && isVariableIdentifierStart(line->at(current))) {
+        const int varLen = grab(*line, current, isIdentifierChar);
+        // check for reserved id
+        if (RESERVED_ID->contains(line->mid(current, varLen))) {
+            QVector<Token> result;
+            // possibly add constructor + op '.'
+            if (conidEnd > start) {
+                result.append(token(TokenType::Constructor, line, start, conidEnd));
+                result.append(token(TokenType::Operator, line, conidEnd, current));
+            }
+            result.append(token(TokenType::Keyword, line, current, current + varLen));
+            return result;
+        }
+        // qualified or unqualified variable, the qualifier is part of the token
+        return {token(TokenType::Variable, line, start, current + varLen)};
+    }
+    // check for operator
+    if (current < length && isSymbol(line->at(current))) {
+        const int opLen = grab(*line, current, isSymbol);
+        // check for reserved op
+        if (RESERVED_OP->contains(line->mid(current, opLen))) {
+            // because of the case of F... (constructor + op '...') etc
+            // we only add conid if we have one, handling the rest in next iteration
+            if (conidEnd > start)
+                return {token(TokenType::Constructor, line, start, conidEnd)};
+            return {token(TokenType::Keyword, line, start, current + opLen)};
+        }
+        // check for single line comment
+        if (opLen >= 2 && std::all_of(line->begin() + current, line->begin() + current + opLen,
+                                      [](const QChar c) { return c == '-'; })) {
+            QVector<Token> result;
+            // possibly add constructor + op '.'
+            if (conidEnd > start) {
+                result.append(token(TokenType::Constructor, line, start, conidEnd));
+                result.append(token(TokenType::Operator, line, conidEnd, current));
+            }
+            // rest is comment
+            result.append(token(TokenType::SingleLineComment, line, current, length));
+            return result;
+        }
+        // check for (qualified?) operator constructor
+        if (line->at(current) == ':')
+            return {token(TokenType::OperatorConstructor, line, start, current + opLen)};
+        return {token(TokenType::Operator, line, start, current + opLen)};
+    }
+    // Foo.Blah.
+    if (conidEnd > start)
+        return {token(TokenType::Constructor, line, start, conidEnd)};
+    return {};
+}
+
+// Returns the length of the escape sequence that begins at index start of
+// line, not counting the backslash that precedes it, or 0 if there is no
+// valid escape sequence at that position.
+//
+// The caller must make sure that start is a valid index into line.
+// Recognized are the single character escapes from CHAR_ESCAPES, decimal,
+// octal ("o...") and hexadecimal ("x...") numbers, control characters
+// ("^" followed by exactly one control character) and the named ASCII
+// escapes from ASCII_ESCAPES.
+static int getEscape(const QString &line, int start)
+{
+    const QChar first = line.at(start);
+    if (CHAR_ESCAPES->contains(first))
+        return 1;
+
+    // decimal
+    if (first.isDigit())
+        return grab(line, start + 1, isDigit) + 1;
+    // octal
+    if (first == 'o') {
+        const int count = grab(line, start + 1, isOctit);
+        if (count < 1) // no octal number after 'o'
+            return 0;
+        return count + 1;
+    }
+    // hexadecimal
+    if (first == 'x') {
+        const int count = grab(line, start + 1, isHexit);
+        if (count < 1) // no hexadecimal number after 'x'
+            return 0;
+        return count + 1;
+    }
+    // ascii cntrl
+    if (first == '^') {
+        // The escape consists of '^' and exactly one control character,
+        // anything after that is regular content again.
+        const int next = start + 1;
+        if (next < line.length() && isCntrl(line.at(next)))
+            return 2;
+        return 0; // no control character after '^'
+    }
+    // named ascii escape, the first matching prefix wins
+    const QStringRef s = line.midRef(start);
+    for (const QString &esc : *ASCII_ESCAPES) {
+        if (s.startsWith(esc))
+            return esc.length();
+    }
+    return 0;
+}
+
+// Tokenizes a string literal beginning at column start, or the continuation
+// of a string from the previous line if *inStringGap is true on entry.
+// Returns an empty list if no string begins at start.
+//
+// The result consists of String tokens for regular content, EscapeSequence
+// tokens for valid escapes (including the backslash) and StringError tokens
+// for invalid escapes, for non-whitespace inside a string gap and for the
+// last character of a string that is not terminated on this line.
+//
+// *inStringGap is updated while scanning; if it is true on return, the line
+// ended inside a string gap.
+static QVector<Token> getString(std::shared_ptr<QString> line, int start, bool *inStringGap/*in-out*/)
+{
+    // Haskell has the specialty of using \<whitespace>\ within strings for multiline strings
+    const int length = line->length();
+    if (start >= length)
+        return {};
+    QVector<Token> result;
+    int tokenStart = start;
+    int current = tokenStart;
+    // continuing a string gap means that we are already inside a string
+    bool inString = *inStringGap;
+    do {
+        const QChar c = line->at(current);
+        if (*inStringGap && !c.isSpace() && c != '\\') {
+            // invalid non-whitespace in string gap
+            // add previous string as token, this is at least a whitespace
+            result.append(token(TokenType::String, line, tokenStart, current));
+            // then add wrong non-whitespace
+            tokenStart = current;
+            do { ++current; } while (current < length && !line->at(current).isSpace());
+            result.append(token(TokenType::StringError, line, tokenStart, current));
+            tokenStart = current;
+        } else if (c == '"') {
+            // opening or closing quote
+            inString = !inString;
+            ++current;
+        } else if (inString) {
+            if (c == '\\') {
+                ++current;
+                if (*inStringGap) {
+                    // ending string gap
+                    *inStringGap = false;
+                } else if (current >= length || line->at(current).isSpace()) {
+                    // starting string gap
+                    *inStringGap = true;
+                    current = std::min(current + 1, length);
+                } else { // there is at least one character after current
+                    const int escapeLength = getEscape(*line, current);
+                    if (escapeLength > 0) {
+                        // valid escape
+                        // add previous string as token without backslash, if necessary
+                        if (tokenStart < current - 1/*backslash*/)
+                            result.append(token(TokenType::String, line, tokenStart, current - 1));
+                        tokenStart = current - 1; // backslash
+                        current += escapeLength;
+                        result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
+                        tokenStart = current;
+                    } else { // invalid escape sequence
+                        // add previous string as token, this is at least backslash
+                        result.append(token(TokenType::String, line, tokenStart, current));
+                        result.append(token(TokenType::StringError, line, current, current + 1));
+                        ++current;
+                        tokenStart = current;
+                    }
+                }
+            } else {
+                ++current;
+            }
+        }
+        // if no string starts at 'start', nothing above advanced 'current' and the loop ends
+    } while (current < length && inString);
+    if (current > tokenStart)
+        result.append(token(TokenType::String, line, tokenStart, current));
+    if (inString && !*inStringGap) { // unterminated string
+        // mark last character of last token as StringError as an error hint
+        if (!result.isEmpty()) { // should actually never be different
+            Token &lastRef = result.last();
+            if (lastRef.length == 1) {
+                lastRef.type = TokenType::StringError;
+            } else {
+                // split the last character off into its own error token
+                --lastRef.length;
+                lastRef.text = line->midRef(lastRef.startCol, lastRef.length);
+                result.append(token(TokenType::StringError, line, current - 1, current));
+            }
+        }
+    }
+    return result;
+}
+
+// Returns a single MultiLineComment token for the comment text that begins at
+// column start, or an empty list if no comment begins or continues there.
+//
+// *commentLevel is the nesting depth of open "{-" on entry and is updated
+// while scanning. Scanning stops at the end of the line or as soon as the
+// depth is back at zero, so a depth greater than zero on return means that
+// the comment continues on the next line.
+static QVector<Token> getMultiLineComment(std::shared_ptr<QString> line, int start,
+                                          int *commentLevel/*in_out*/)
+{
+    // Haskell multiline comments can be nested {- foo {- bar -} blah -}
+    const int length = line->length();
+    int pos = start;
+    for (;;) {
+        const QStringRef pair = line->midRef(pos, 2);
+        if (pair == "{-") {
+            ++(*commentLevel);
+            pos += 2;
+        } else if (*commentLevel > 0) {
+            if (pair == "-}") {
+                --(*commentLevel);
+                pos += 2;
+            } else {
+                ++pos;
+            }
+        }
+        if (pos >= length || *commentLevel <= 0)
+            break;
+    }
+    if (pos == start)
+        return {};
+    return {token(TokenType::MultiLineComment, line, start, pos)};
+}
+
+// Tokenizes a character literal beginning at column start. Returns an empty
+// list if the character at start is not a single quote.
+//
+// The result consists of Char tokens for the quotes and a regular character,
+// an EscapeSequence token for a valid escape (including the backslash) and
+// CharError tokens for an invalid escape, for everything after the first
+// character of a too long literal, and for the last character of an empty or
+// unterminated literal.
+static QVector<Token> getChar(std::shared_ptr<QString> line, int start)
+{
+    if (line->at(start) != '\'')
+        return {};
+    QVector<Token> result;
+    const int length = line->length();
+    int tokenStart = start;
+    int current = tokenStart + 1;
+    bool inChar = true;
+    int count = 0; // number of characters (or escapes) seen inside the quotes
+    while (current < length && inChar) {
+        if (line->at(current) == '\'') {
+            inChar = false;
+            ++current;
+        } else if (count == 1) {
+            // we already have one character, so start CharError token
+            if (current > tokenStart)
+                result.append(token(TokenType::Char, line, tokenStart, current));
+            tokenStart = current;
+            ++count;
+            ++current;
+        } else if (count > 1) {
+            // still inside the too long part
+            ++count;
+            ++current;
+        } else if (line->at(current) == '\\') {
+            if (current + 1 < length) {
+                ++current;
+                ++count;
+                const int escapeLength = getEscape(*line, current);
+                if (line->at(current) != '&' && escapeLength > 0) { // no & escape for chars
+                    // valid escape
+                    // add previous string as token without backslash, if necessary
+                    if (tokenStart < current - 1/*backslash*/)
+                        result.append(token(TokenType::Char, line, tokenStart, current - 1));
+                    tokenStart = current - 1; // backslash
+                    current += escapeLength;
+                    result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
+                    tokenStart = current;
+                } else { // invalid escape sequence
+                    // add previous string as token, this is at least backslash
+                    result.append(token(TokenType::Char, line, tokenStart, current));
+                    result.append(token(TokenType::CharError, line, current, current + 1));
+                    ++current;
+                    tokenStart = current;
+                }
+            } else {
+                // backslash is the last character of the line
+                ++current;
+            }
+        } else {
+            // first regular character
+            ++count;
+            ++current;
+        }
+    }
+    if (count > 1 && inChar) {
+        // too long and unterminated, just add CharError token till end
+        result.append(token(TokenType::CharError, line, tokenStart, current));
+    } else if (count > 1) {
+        // too long but terminated, add CharError up to ending quote, then quote
+        result.append(token(TokenType::CharError, line, tokenStart, current - 1));
+        result.append(token(TokenType::Char, line, current - 1, current));
+    } else if (inChar || count < 1) {
+        // unterminated, or no character inside, mark last character as error
+        if (current > tokenStart + 1)
+            result.append(token(TokenType::Char, line, tokenStart, current - 1));
+        result.append(token(TokenType::CharError, line, current - 1, current));
+    } else {
+        result.append(token(TokenType::Char, line, tokenStart, current));
+    }
+    return result;
+}
+
+// Returns a single Special token of length 1 if the character at column
+// start is one of the characters in SPECIAL, otherwise an empty list.
+static QVector<Token> getSpecial(std::shared_ptr<QString> line, int start)
+{
+    const bool isSpecial = SPECIAL->contains(line->at(start));
+    if (!isSpecial)
+        return {};
+    return {token(TokenType::Special, line, start, start + 1)};
+}
+
+// Splits one line of Haskell source into tokens.
+//
+// startState is the state that the previous line ended in: either
+// Tokens::State::StringGap, or a value above
+// Tokens::State::MultiLineCommentGuard whose distance to the guard is the
+// nesting depth of the still open multi-line comment. Any other value starts
+// the line outside of strings and comments.
+//
+// The state to pass on to the following line is stored in the result's state
+// member if the line ends inside a string gap or an open multi-line comment;
+// otherwise that member is left untouched.
+Tokens HaskellTokenizer::tokenize(const QString &line, int startState)
+{
+    Tokens result(std::make_shared<QString>(line));
+    const int length = result.source->length();
+    bool inStringGap = startState == int(Tokens::State::StringGap);
+    int multiLineCommentLevel = std::max(startState - int(Tokens::State::MultiLineCommentGuard), 0);
+    int currentStart = 0;
+    while (currentStart < length) {
+        // Try the token kinds in order of precedence, the first one that
+        // produces tokens wins.
+        QVector<Token> tokens;
+        // strings are not looked for inside of multi-line comments
+        if (multiLineCommentLevel <= 0)
+            tokens = getString(result.source, currentStart, &inStringGap);
+        if (tokens.isEmpty())
+            tokens = getMultiLineComment(result.source, currentStart, &multiLineCommentLevel);
+        if (tokens.isEmpty())
+            tokens = getChar(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getSpace(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getNumber(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getIdOrOpOrSingleLineComment(result.source, currentStart);
+        if (tokens.isEmpty())
+            tokens = getSpecial(result.source, currentStart);
+        if (tokens.isEmpty()) // nothing matched, flag a single character as unknown
+            tokens.append(token(TokenType::Unknown, result.source, currentStart, currentStart + 1));
+        result.append(tokens);
+        // continue behind everything that was consumed in this round
+        for (const Token &consumed : tokens)
+            currentStart += consumed.length;
+    }
+    if (inStringGap)
+        result.state = int(Tokens::State::StringGap);
+    else if (multiLineCommentLevel > 0)
+        result.state = int(Tokens::State::MultiLineCommentGuard) + multiLineCommentLevel;
+    return result;
+}
+
+// A token is valid unless its type is TokenType::Unknown, which is also the
+// type of a default constructed token.
+bool Token::isValid() const
+{
+    const bool isUnknown = (type == TokenType::Unknown);
+    return !isUnknown;
+}
+
+} // Internal
+} // Haskell
diff --git a/plugins/haskell/haskelltokenizer.h b/plugins/haskell/haskelltokenizer.h
new file mode 100644
index 0000000..46b4b00
--- /dev/null
+++ b/plugins/haskell/haskelltokenizer.h
@@ -0,0 +1,91 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#pragma once
+
+#include <QChar>
+#include <QString>
+#include <QVector>
+
+#include <memory>
+
+namespace Haskell {
+namespace Internal {
+
+enum class TokenType { // category assigned to each Token
+    Variable,
+    Constructor,
+    Operator,
+    OperatorConstructor,
+    Whitespace,
+    String,
+    StringError, // malformed part inside a string literal
+    Char,
+    CharError, // malformed part inside a char literal
+    EscapeSequence, // backslash escape inside a string or char literal
+    Integer,
+    Float,
+    Keyword, // words like data and case, and operators like -> and ..
+    Special,
+    SingleLineComment,
+    MultiLineComment,
+    Unknown // default type of a Token, reported as invalid by Token::isValid()
+};
+
+class Token { // one lexed piece of a tokenized line
+public:
+    bool isValid() const; // false while type is TokenType::Unknown
+
+    TokenType type = TokenType::Unknown;
+    int startCol = -1; // index of the first covered character within the line
+    int length = -1; // number of covered characters
+    QStringRef text; // the covered characters, as a reference into *source
+
+    std::shared_ptr<QString> source; // keep the string ref alive
+};
+
+class Tokens : public QVector<Token> // tokens of one line plus the state reached at its end
+{
+public:
+    enum class State {
+        None = -1,
+        StringGap = 0, // gap == two backslashes enclosing only whitespace
+        MultiLineCommentGuard // must stay the last enumerator: guard + N encodes N open multi-line comments
+    };
+
+    Tokens(std::shared_ptr<QString> source);
+
+    std::shared_ptr<QString> source; // the tokenized text that the QStringRef of each token refers to
+    int state = int(State::None); // a State value, or MultiLineCommentGuard + comment nesting level
+};
+
+class HaskellTokenizer
+{
+public:
+    static Tokens tokenize(const QString &line, int startState); // startState uses the same encoding as Tokens::state
+};
+
+} // Internal
+} // Haskell
diff --git a/tests/auto/tokenizer/tokenizer.pro b/tests/auto/tokenizer/tokenizer.pro
new file mode 100644
index 0000000..a9ec439
--- /dev/null
+++ b/tests/auto/tokenizer/tokenizer.pro
@@ -0,0 +1,11 @@
+include(../../../plugins/haskell/config.pri)
+
+include($$IDE_SOURCE_TREE/tests/auto/qttest.pri)
+
+SOURCES += tst_tokenizer.cpp \
+    $$PWD/../../../plugins/haskell/haskelltokenizer.cpp
+
+HEADERS += \
+    $$PWD/../../../plugins/haskell/haskelltokenizer.h
+
+INCLUDEPATH += $$PWD/../../../plugins/haskell
diff --git a/tests/auto/tokenizer/tst_tokenizer.cpp b/tests/auto/tokenizer/tst_tokenizer.cpp
new file mode 100644
index 0000000..ffa34b2
--- /dev/null
+++ b/tests/auto/tokenizer/tst_tokenizer.cpp
@@ -0,0 +1,730 @@
+/****************************************************************************
+**
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of Qt Creator.
+**
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3 as published by the Free Software
+** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-3.0.html.
+**
+****************************************************************************/
+
+#include <haskelltokenizer.h>
+
+#include <QObject>
+#include <QtTest>
+
+using namespace Haskell::Internal;
+
+const QSet<char> escapes{'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'}; // characters the tests expect to form a valid escape right after a backslash
+
+struct TokenInfo // expected token as written in the test tables
+{
+    TokenType type;
+    int column; // compared against Token::startCol
+    QString text; // compared against Token::text and Token::length
+};
+
+Q_DECLARE_METATYPE(TokenInfo)
+
+bool operator==(const TokenInfo &info, const Token &token) // expected token vs. actual token
+{
+    return info.type == token.type
+            && info.column == token.startCol
+            && info.text.length() == token.length // also checks that Token::length matches the text
+            && info.text == token.text.toString();
+}
+
+bool operator==(const Token &token, const TokenInfo &info)
+{
+    return info == token; // delegates to the (TokenInfo, Token) overload
+}
+
+class tst_Tokenizer : public QObject
+{
+    Q_OBJECT
+
+private slots: // Qt Test runs each slot X once per row created by X_data
+    void singleLineComment_data();
+    void singleLineComment();
+
+    void multiLineComment_data();
+    void multiLineComment();
+
+    void string_data();
+    void string();
+
+    void character_data();
+    void character();
+
+    void number_data();
+    void number();
+
+    void keyword_data();
+    void keyword();
+
+    void variable_data();
+    void variable();
+
+    void constructor_data();
+    void constructor();
+
+    void op_data();
+    void op();
+
+private:
+    void setupData(); // declares the columns shared by all tests
+    void addRow(const char *name, // appends one row: input line, expected tokens, start and end state
+                const QString &input,
+                const QList<TokenInfo> &tokens,
+                Tokens::State startState = Tokens::State::None,
+                Tokens::State endState = Tokens::State::None);
+    void checkData(); // tokenizes the current row and compares with its expectation
+};
+
+void tst_Tokenizer::setupData() // column names and types must match the QFETCH calls in checkData()
+{
+    QTest::addColumn<QString>("input");
+    QTest::addColumn<QList<TokenInfo>>("output");
+    QTest::addColumn<int>("startState");
+    QTest::addColumn<int>("endState");
+}
+
+void tst_Tokenizer::addRow(const char *name,
+                           const QString &input,
+                           const QList<TokenInfo> &tokens,
+                           Tokens::State startState,
+                           Tokens::State endState)
+{
+    QTest::newRow(name) << input << tokens << int(startState) << int(endState); // stream order follows the column order of setupData()
+}
+
+void tst_Tokenizer::checkData() // shared body of all test slots
+{
+    QFETCH(QString, input);
+    QFETCH(QList<TokenInfo>, output);
+    QFETCH(int, startState);
+    QFETCH(int, endState);
+    const Tokens tokens = HaskellTokenizer::tokenize(input, startState);
+    QCOMPARE(tokens.length(), output.length()); // QCOMPARE returns on failure, so the loop below never indexes past output
+    QCOMPARE(tokens.state, endState);
+    for (int i = 0; i < tokens.length(); ++i) {
+        const Token t = tokens.at(i);
+        const TokenInfo ti = output.at(i);
+        QVERIFY2(t == ti, QString("Token at index %1 does not match, {%2, %3, \"%4\"} != {%5, %6, \"%7\"}")
+                 .arg(i)
+                 .arg(int(t.type)).arg(t.startCol).arg(t.text.toString()) // actual token
+                 .arg(int(ti.type)).arg(ti.column).arg(ti.text) // expected token
+                 .toUtf8().constData());
+    }
+}
+
+void tst_Tokenizer::singleLineComment_data() // rows: dashes that do or do not start a line comment
+{
+    setupData();
+
+    addRow("simple", " -- foo", {
+               {TokenType::Whitespace, 0, " "},
+               {TokenType::SingleLineComment, 1, "-- foo"}
+           });
+    addRow("dash, id", "--foo", {
+               {TokenType::SingleLineComment, 0, "--foo"}
+           });
+    addRow("dash, space, op", "-- |foo", {
+               {TokenType::SingleLineComment, 0, "-- |foo"}
+           });
+    addRow("multi-dash, space", "---- foo", {
+               {TokenType::SingleLineComment, 0, "---- foo"}
+           });
+    addRow("dash, op", "--| foo", { // dashes directly followed by a symbol are expected to lex as one operator
+               {TokenType::Operator, 0, "--|"},
+               {TokenType::Whitespace, 3, " "},
+               {TokenType::Variable, 4, "foo"}
+           });
+    addRow("dash, special", "--(foo", {
+               {TokenType::SingleLineComment, 0, "--(foo"}
+           });
+    addRow("not a qualified varsym", "F.-- foo", { // expected split: constructor, dot operator, comment
+               {TokenType::Constructor, 0, "F"},
+               {TokenType::Operator, 1, "."},
+               {TokenType::SingleLineComment, 2, "-- foo"}
+           });
+}
+
+void tst_Tokenizer::singleLineComment()
+{
+    checkData(); // checks the current row of singleLineComment_data()
+}
+
+void tst_Tokenizer::multiLineComment_data() // rows: block comments, including state carried across lines
+{
+    setupData();
+
+    addRow("trailing dashes", "{---foo -}", {
+               {TokenType::MultiLineComment, 0, "{---foo -}"}
+           });
+    addRow("multiline", "{- foo", { // comment stays open, so the end state is guard + 1
+               {TokenType::MultiLineComment, 0, "{- foo"}
+           },
+           Tokens::State::None,
+           Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1));
+    addRow("multiline2", "bar -}", {
+               {TokenType::MultiLineComment, 0, "bar -}"}
+           },
+           Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1),
+           Tokens::State::None);
+    addRow("nested", "{- fo{-o", { // two comments are open at the line end, so the end state is guard + 2
+               {TokenType::MultiLineComment, 0, "{- fo{-o"}
+           },
+           Tokens::State::None,
+           Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2));
+    addRow("nested2", "bar -}", { // closes only the inner comment
+               {TokenType::MultiLineComment, 0, "bar -}"}
+           },
+           Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 2),
+           Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1));
+    addRow("nested3", "bar -}", {
+               {TokenType::MultiLineComment, 0, "bar -}"}
+           },
+           Tokens::State(int(Tokens::State::MultiLineCommentGuard) + 1),
+           Tokens::State::None);
+}
+
+void tst_Tokenizer::multiLineComment()
+{
+    checkData(); // checks the current row of multiLineComment_data()
+}
+
+void tst_Tokenizer::string_data() // rows: string literals, gaps, and valid or invalid escapes
+{
+    setupData();
+
+    addRow("simple", "\"foo\"", {
+               {TokenType::String, 0, "\"foo\""}
+           });
+
+    addRow("unterminated", "\"", {
+               {TokenType::StringError, 0, "\""}
+           });
+    addRow("unterminated2", "\"foo", { // the last character before the line end is expected to carry the error
+               {TokenType::String, 0, "\"fo"},
+               {TokenType::StringError, 3, "o"}
+           });
+    addRow("unterminated with escape", "\"\\\\", {
+               {TokenType::String, 0, "\""},
+               {TokenType::EscapeSequence, 1, "\\"},
+               {TokenType::StringError, 2, "\\"}
+           });
+
+    // gaps
+    addRow("gap", "\" \\   \\\"", {
+               {TokenType::String, 0, "\" \\   \\\""}
+           });
+    addRow("gap over endline", "\"foo\\", {
+               {TokenType::String, 0, "\"foo\\"}
+           },
+           Tokens::State::None, Tokens::State::StringGap);
+    addRow("gap over endline2", "\\foo\"", {
+               {TokenType::String, 0, "\\foo\""}
+           },
+           Tokens::State::StringGap, Tokens::State::None);
+    addRow("gap error", "\"\\ ab \\\"", {
+               {TokenType::String, 0, "\"\\ "},
+               {TokenType::StringError, 3, "ab"},
+               {TokenType::String, 5, " \\\""}
+           });
+    addRow("gap error with quote", "\"\\ \"", { // the gap is expected to stay open, hence end state StringGap
+               {TokenType::String, 0, "\"\\ "},
+               {TokenType::StringError, 3, "\""}
+           },
+           Tokens::State::None, Tokens::State::StringGap);
+
+    // char escapes (including wrong ones)
+    for (char c = '!'; c <= '~'; ++c) {
+        // skip uppercase and '^', since these can be part of ascii escapes
+        // and 'o' and 'x' since they start octal and hex escapes
+        // and digits as part of decimal escape
+        if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x')
+            continue;
+        const QChar qc(c);
+        const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8();
+        const QString input = QString("\"\\%1\"").arg(qc);
+        if (escapes.contains(c)) {
+            addRow(name.constData(), input, {
+                       {TokenType::String, 0, "\""},
+                       {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc},
+                       {TokenType::String, 3, "\""}
+                   });
+        } else {
+            addRow(name.constData(), input, {
+                       {TokenType::String, 0, "\"\\"},
+                       {TokenType::StringError, 2, qc},
+                       {TokenType::String, 3, "\""}
+                   });
+        }
+    }
+
+    addRow("decimal escape", "\"\\1234a\"", {
+               {TokenType::String, 0, "\""},
+               {TokenType::EscapeSequence, 1, "\\1234"},
+               {TokenType::String, 6, "a\""}
+           });
+
+    addRow("octal escape", "\"\\o0678a\"", {
+               {TokenType::String, 0, "\""},
+               {TokenType::EscapeSequence, 1, "\\o067"},
+               {TokenType::String, 6, "8a\""}
+           });
+    addRow("octal escape error", "\"\\o8a\"", {
+               {TokenType::String, 0, "\"\\"},
+               {TokenType::StringError, 2, "o"},
+               {TokenType::String, 3, "8a\""}
+           });
+
+    addRow("hexadecimal escape", "\"\\x0678Abg\"", {
+               {TokenType::String, 0, "\""},
+               {TokenType::EscapeSequence, 1, "\\x0678Ab"},
+               {TokenType::String, 9, "g\""}
+           });
+    addRow("hexadecimal escape error", "\"\\xg\"", {
+               {TokenType::String, 0, "\"\\"},
+               {TokenType::StringError, 2, "x"},
+               {TokenType::String, 3, "g\""}
+           });
+
+    // ascii cntrl escapes (including wrong ones)
+    for (char c = '!'; c <= '~'; ++c) {
+        if (c == '"') // is special because it also ends the string
+            continue;
+        const QChar qc(c);
+        const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8();
+        const QString input = QString("\"\\^%1\"").arg(qc);
+        if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']'
+                || qc == '^' || qc == '_') {
+            addRow(name.constData(), input, {
+                       {TokenType::String, 0, "\""},
+                       {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc},
+                       {TokenType::String, 4, "\""}
+                   });
+        } else {
+            addRow(name.constData(), input, {
+                       {TokenType::String, 0, "\"\\"},
+                       {TokenType::StringError, 2, "^"},
+                       {TokenType::String, 3, QString(qc) + "\""}
+                   });
+        }
+    }
+
+    addRow("ascii escape SOH", "\"\\SOHN\"", {
+               {TokenType::String, 0, "\""},
+               {TokenType::EscapeSequence, 1, "\\SOH"},
+               {TokenType::String, 5, "N\""}
+           });
+    addRow("ascii escape SO", "\"\\SON\"", {
+               {TokenType::String, 0, "\""},
+               {TokenType::EscapeSequence, 1, "\\SO"},
+               {TokenType::String, 4, "N\""}
+           });
+    addRow("ascii escape error", "\"\\TON\"", {
+               {TokenType::String, 0, "\"\\"},
+               {TokenType::StringError, 2, "T"},
+               {TokenType::String, 3, "ON\""}
+           });
+    addRow("ascii escape error 2", "\"\\STO\"", {
+               {TokenType::String, 0, "\"\\"},
+               {TokenType::StringError, 2, "S"},
+               {TokenType::String, 3, "TO\""}
+           });
+}
+
+void tst_Tokenizer::string()
+{
+    checkData(); // checks the current row of string_data()
+}
+
+void tst_Tokenizer::character_data() // rows: char literals with wrong lengths and valid or invalid escapes
+{
+    setupData();
+
+    addRow("simple", "'a'", {
+               {TokenType::Char, 0, "'a'"}
+           });
+    addRow("too many", "'abc'", {
+               {TokenType::Char, 0, "'a"},
+               {TokenType::CharError, 2, "bc"},
+               {TokenType::Char, 4, "'"}
+           });
+    addRow("too few", "''", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::CharError, 1, "'"}
+           });
+    addRow("only quote", "'", {
+               {TokenType::CharError, 0, "'"}
+           });
+    addRow("unterminated", "'a", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::CharError, 1, "a"}
+           });
+    addRow("unterminated too many", "'abc", {
+               {TokenType::Char, 0, "'a"},
+               {TokenType::CharError, 2, "bc"}
+           });
+    addRow("unterminated backslash", "'\\", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::CharError, 1, "\\"}
+           });
+
+    // char escapes (including wrong ones)
+    for (char c = '!'; c <= '~'; ++c) {
+        // skip uppercase and '^', since these can be part of ascii escapes
+        // and 'o' and 'x' since they start octal and hex escapes
+        // and digits as part of decimal escape
+        if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '^' || c == 'o' || c == 'x')
+            continue;
+        const QChar qc(c);
+        const QByteArray name = QString("charesc '%1'").arg(qc).toUtf8();
+        const QString input = QString("'\\%1'").arg(qc);
+        if (c != '&' && escapes.contains(c)) { // unlike in strings, backslash-ampersand is expected to be an error here
+            addRow(name.constData(), input, {
+                       {TokenType::Char, 0, "'"},
+                       {TokenType::EscapeSequence, 1, QLatin1String("\\") + qc},
+                       {TokenType::Char, 3, "'"}
+                   });
+        } else {
+            addRow(name.constData(), input, {
+                       {TokenType::Char, 0, "'\\"},
+                       {TokenType::CharError, 2, qc},
+                       {TokenType::Char, 3, "'"}
+                   });
+        }
+    }
+
+    addRow("decimal escape", "'\\1234'", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::EscapeSequence, 1, "\\1234"},
+               {TokenType::Char, 6, "'"}
+           });
+    addRow("decimal escape too long", "'\\1234a'", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::EscapeSequence, 1, "\\1234"},
+               {TokenType::CharError, 6, "a"},
+               {TokenType::Char, 7, "'"}
+           });
+
+    addRow("octal escape", "'\\o067'", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::EscapeSequence, 1, "\\o067"},
+               {TokenType::Char, 6, "'"}
+           });
+    addRow("octal escape error", "'\\o8'", {
+               {TokenType::Char, 0, "'\\"},
+               {TokenType::CharError, 2, "o"},
+               {TokenType::CharError, 3, "8"},
+               {TokenType::Char, 4, "'"}
+           });
+
+    addRow("hexadecimal escape", "'\\x0678Ab'", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::EscapeSequence, 1, "\\x0678Ab"},
+               {TokenType::Char, 9, "'"}
+           });
+    addRow("hexadecimal escape error", "'\\xg'", {
+               {TokenType::Char, 0, "'\\"},
+               {TokenType::CharError, 2, "x"},
+               {TokenType::CharError, 3, "g"},
+               {TokenType::Char, 4, "'"}
+           });
+
+    // ascii cntrl escapes (including wrong ones)
+    for (char c = '!'; c <= '~'; ++c) {
+        if (c == '\'') // is special because it also ends the char literal
+            continue;
+        const QChar qc(c);
+        const QByteArray name = QString("ascii cntrl '^%1'").arg(qc).toUtf8();
+        const QString input = QString("'\\^%1'").arg(qc);
+        if ((qc >= 'A' && qc <= 'Z') || qc == '@' || qc == '[' || qc == '\\' || qc == ']'
+                || qc == '^' || qc == '_') {
+            addRow(name.constData(), input, {
+                       {TokenType::Char, 0, "'"},
+                       {TokenType::EscapeSequence, 1, QLatin1String("\\^") + qc},
+                       {TokenType::Char, 4, "'"}
+                   });
+        } else {
+            addRow(name.constData(), input, {
+                       {TokenType::Char, 0, "'\\"},
+                       {TokenType::CharError, 2, "^"},
+                       {TokenType::CharError, 3, qc},
+                       {TokenType::Char, 4, "'"}
+                   });
+        }
+    }
+
+    addRow("ascii escape SOH", "'\\SOH'", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::EscapeSequence, 1, "\\SOH"},
+               {TokenType::Char, 5, "'"}
+           });
+    addRow("ascii escape SO, too long", "'\\SON'", {
+               {TokenType::Char, 0, "'"},
+               {TokenType::EscapeSequence, 1, "\\SO"},
+               {TokenType::CharError, 4, "N"},
+               {TokenType::Char, 5, "'"}
+           });
+    addRow("ascii escape error", "'\\TON'", {
+               {TokenType::Char, 0, "'\\"},
+               {TokenType::CharError, 2, "T"},
+               {TokenType::CharError, 3, "ON"},
+               {TokenType::Char, 5, "'"}
+           });
+}
+
+void tst_Tokenizer::character()
+{
+    checkData(); // checks the current row of character_data()
+}
+
+void tst_Tokenizer::number_data() // rows: integer and float literals and where they end; every data tag is unique
+{
+    setupData();
+
+    addRow("decimal", "012345", {
+               {TokenType::Integer, 0, "012345"}
+           });
+    addRow("single digit decimal", "0", {
+               {TokenType::Integer, 0, "0"}
+           });
+    addRow("octal", "0o1234", {
+               {TokenType::Integer, 0, "0o1234"}
+           });
+    // this is a bit weird, but correct: octal 1 followed by decimal 8
+    addRow("number after octal", "0O18", {
+               {TokenType::Integer, 0, "0O1"},
+               {TokenType::Integer, 3, "8"}
+           });
+    addRow("not octal", "0o9", {
+               {TokenType::Integer, 0, "0"},
+               {TokenType::Variable, 1, "o9"},
+           });
+    addRow("hexadecimal", "0x9fA", {
+               {TokenType::Integer, 0, "0x9fA"}
+           });
+    // hex number followed by identifier 'g'
+    addRow("hexadecimal followed by identifier", "0X9fg", {
+               {TokenType::Integer, 0, "0X9f"},
+               {TokenType::Variable, 4, "g"}
+           });
+
+    // 0 followed by identifier
+    addRow("decimal followed by identifier", "0z6", {
+               {TokenType::Integer, 0, "0"},
+               {TokenType::Variable, 1, "z6"}
+           });
+
+    addRow("float", "0123.45", {
+               {TokenType::Float, 0, "0123.45"}
+           });
+    addRow("decimal + operator '.'", "0123.", {
+               {TokenType::Integer, 0, "0123"},
+               {TokenType::Operator, 4, "."}
+           });
+    addRow("operator '.' + decimal", ".0123", {
+               {TokenType::Operator, 0, "."},
+               {TokenType::Integer, 1, "0123"}
+           });
+    addRow("without '.', with exp 'e'", "0123e45", {
+               {TokenType::Float, 0, "0123e45"}
+           });
+    addRow("without '.', with exp 'E'", "0123E45", {
+               {TokenType::Float, 0, "0123E45"}
+           });
+    addRow("without '.', with '+'", "0123e+45", {
+               {TokenType::Float, 0, "0123e+45"}
+           });
+    addRow("without '.', with '-'", "0123e-45", {
+               {TokenType::Float, 0, "0123e-45"}
+           });
+    addRow("without '.', with '+', missing decimal", "0123e+", { // an exponent without digits is expected not to belong to the number
+               {TokenType::Integer, 0, "0123"},
+               {TokenType::Variable, 4, "e"},
+               {TokenType::Operator, 5, "+"}
+           });
+    addRow("without '.', missing decimal", "0123e", {
+               {TokenType::Integer, 0, "0123"},
+               {TokenType::Variable, 4, "e"}
+           });
+    addRow("exp 'e'", "01.23e45", {
+               {TokenType::Float, 0, "01.23e45"}
+           });
+    addRow("exp 'E'", "01.23E45", {
+               {TokenType::Float, 0, "01.23E45"}
+           });
+    addRow("with '+'", "01.23e+45", {
+               {TokenType::Float, 0, "01.23e+45"}
+           });
+    addRow("with '-'", "01.23e-45", {
+               {TokenType::Float, 0, "01.23e-45"}
+           });
+    addRow("with '+', missing decimal", "01.23e+", {
+               {TokenType::Float, 0, "01.23"},
+               {TokenType::Variable, 5, "e"},
+               {TokenType::Operator, 6, "+"}
+           });
+    addRow("missing decimal", "01.23e", {
+               {TokenType::Float, 0, "01.23"},
+               {TokenType::Variable, 5, "e"}
+           });
+}
+
+void tst_Tokenizer::number()
+{
+    checkData(); // checks the current row of number_data()
+}
+
+void tst_Tokenizer::keyword_data() // rows: reserved words and operators expected as Keyword
+{
+    setupData();
+
+    addRow("data", "data", {
+               {TokenType::Keyword, 0, "data"}
+           });
+    addRow("not a qualified varid", "Foo.case", { // a keyword after the dot is expected as three separate tokens
+               {TokenType::Constructor, 0, "Foo"},
+               {TokenType::Operator, 3, "."},
+               {TokenType::Keyword, 4, "case"}
+           });
+    addRow(":", ":", {
+               {TokenType::Keyword, 0, ":"}
+           });
+    addRow("->", "->", {
+               {TokenType::Keyword, 0, "->"}
+           });
+    addRow("not a qualified varsym", "Foo...", {
+               {TokenType::Constructor, 0, "Foo"},
+               {TokenType::Operator, 3, "..."}
+           });
+}
+
+void tst_Tokenizer::keyword()
+{
+    checkData(); // checks the current row of keyword_data()
+}
+
+void tst_Tokenizer::variable_data() // rows: plain and qualified variable names and their boundaries
+{
+    setupData();
+
+    addRow("simple", "fOo_1'", {
+               {TokenType::Variable, 0, "fOo_1'"}
+           });
+    addRow("start with '_'", "_1", {
+               {TokenType::Variable, 0, "_1"}
+           });
+    addRow("not a keyword", "cases", {
+               {TokenType::Variable, 0, "cases"}
+           });
+    addRow("not a keyword 2", "qualified", { // qualified, as and hiding are expected to lex as plain variables
+               {TokenType::Variable, 0, "qualified"}
+           });
+    addRow("not a keyword 3", "as", {
+               {TokenType::Variable, 0, "as"}
+           });
+    addRow("not a keyword 4", "hiding", {
+               {TokenType::Variable, 0, "hiding"}
+           });
+    addRow(".variable", ".foo", {
+               {TokenType::Operator, 0, "."},
+               {TokenType::Variable, 1, "foo"}
+           });
+    addRow("variable.", "foo.", {
+               {TokenType::Variable, 0, "foo"},
+               {TokenType::Operator, 3, "."}
+           });
+    addRow("variable.variable", "blah.foo", { // a lowercase prefix does not qualify, so the dot is an operator
+               {TokenType::Variable, 0, "blah"},
+               {TokenType::Operator, 4, "."},
+               {TokenType::Variable, 5, "foo"}
+           });
+    addRow("qualified", "Blah.foo", {
+               {TokenType::Variable, 0, "Blah.foo"}
+           });
+    addRow("qualified2", "Goo.Blah.foo", {
+               {TokenType::Variable, 0, "Goo.Blah.foo"}
+           });
+    addRow("variable + op '..'", "foo..", {
+               {TokenType::Variable, 0, "foo"},
+               {TokenType::Keyword, 3, ".."}
+           });
+    addRow("variable + op '...'", "foo...", {
+               {TokenType::Variable, 0, "foo"},
+               {TokenType::Operator, 3, "..."}
+           });
+}
+
+void tst_Tokenizer::variable()
+{
+    checkData(); // checks the current row of variable_data()
+}
+
+void tst_Tokenizer::constructor_data() // rows: plain and qualified constructor names
+{
+    setupData();
+
+    addRow("simple", "Foo", {
+               {TokenType::Constructor, 0, "Foo"}
+           });
+    addRow("qualified", "Foo.Bar", {
+               {TokenType::Constructor, 0, "Foo.Bar"}
+           });
+    addRow("followed by op '.'", "Foo.Bar.", { // the trailing dot is expected as a separate operator
+               {TokenType::Constructor, 0, "Foo.Bar"},
+               {TokenType::Operator, 7, "."}
+           });
+
+}
+
+void tst_Tokenizer::constructor()
+{
+    checkData(); // checks the current row of constructor_data()
+}
+
+void tst_Tokenizer::op_data() // rows: plain and qualified operator symbols
+{
+    setupData();
+
+    addRow("simple", "+-=", {
+               {TokenType::Operator, 0, "+-="}
+           });
+    addRow("qualified", "Foo.+-=", {
+               {TokenType::Operator, 0, "Foo.+-="}
+           });
+    addRow("qualified '.'", "Foo..", { // expected as a single qualified operator token
+               {TokenType::Operator, 0, "Foo.."}
+           });
+    addRow("constructor plus op", "Foo+", {
+               {TokenType::Constructor, 0, "Foo"},
+               {TokenType::Operator, 3, "+"}
+           });
+}
+
+void tst_Tokenizer::op()
+{
+    checkData(); // checks the current row of op_data()
+}
+
+QTEST_MAIN(tst_Tokenizer)
+
+#include "tst_tokenizer.moc"
author	Eike Ziller <git@eikeziller.de>	2017-04-29 16:17:11 +0200
committer	Eike Ziller <git@eikeziller.de>	2017-10-01 20:11:08 +0200
commit	5798e33d742c0f413d2d865fdb75739b4374ce98 (patch)
tree	e7d36edf5de22ab74ed4b56e2e2b22be24f50ef6
parent	2f69373309cfe88084c5777baeff6bb46eecd071 (diff)