diff options
author | Eike Ziller <git@eikeziller.de> | 2017-04-29 16:17:11 +0200 |
---|---|---|
committer | Eike Ziller <git@eikeziller.de> | 2017-10-01 20:11:08 +0200 |
commit | 5798e33d742c0f413d2d865fdb75739b4374ce98 (patch) | |
tree | e7d36edf5de22ab74ed4b56e2e2b22be24f50ef6 /plugins/haskell | |
parent | 2f69373309cfe88084c5777baeff6bb46eecd071 (diff) |
Add highlighter
See lexical structure of Haskell
https://www.haskell.org/onlinereport/haskell2010/haskellch2.html
Diffstat (limited to 'plugins/haskell')
-rw-r--r-- | plugins/haskell/haskell.pro | 8 | ||||
-rw-r--r-- | plugins/haskell/haskelleditorfactory.cpp | 2 | ||||
-rw-r--r-- | plugins/haskell/haskellhighlighter.cpp | 152 | ||||
-rw-r--r-- | plugins/haskell/haskellhighlighter.h | 58 | ||||
-rw-r--r-- | plugins/haskell/haskelltokenizer.cpp | 631 | ||||
-rw-r--r-- | plugins/haskell/haskelltokenizer.h | 91 |
6 files changed, 940 insertions, 2 deletions
diff --git a/plugins/haskell/haskell.pro b/plugins/haskell/haskell.pro index 9c557b1..aa92d3c 100644 --- a/plugins/haskell/haskell.pro +++ b/plugins/haskell/haskell.pro @@ -5,14 +5,18 @@ DEFINES += HASKELL_LIBRARY SOURCES += \ haskellcompletionassist.cpp \ haskelleditorfactory.cpp \ - haskellplugin.cpp + haskellplugin.cpp \ + haskellhighlighter.cpp \ + haskelltokenizer.cpp HEADERS += \ haskell_global.h \ haskellcompletionassist.h \ haskellconstants.h \ haskelleditorfactory.h \ - haskellplugin.h + haskellplugin.h \ + haskellhighlighter.h \ + haskelltokenizer.h ## uncomment to build plugin into user config directory ## <localappdata>/plugins/<ideversion> diff --git a/plugins/haskell/haskelleditorfactory.cpp b/plugins/haskell/haskelleditorfactory.cpp index 220e52e..8119105 100644 --- a/plugins/haskell/haskelleditorfactory.cpp +++ b/plugins/haskell/haskelleditorfactory.cpp @@ -27,6 +27,7 @@ #include "haskellcompletionassist.h" #include "haskellconstants.h" +#include "haskellhighlighter.h" #include <texteditor/textdocument.h> #include <texteditor/texteditoractionhandler.h> @@ -47,6 +48,7 @@ HaskellEditorFactory::HaskellEditorFactory() setParenthesesMatchingEnabled(true); setMarksVisible(true); setCompletionAssistProvider(new HaskellCompletionAssistProvider); + setSyntaxHighlighterCreator([] { return new HaskellHighlighter(); }); } } // Internal diff --git a/plugins/haskell/haskellhighlighter.cpp b/plugins/haskell/haskellhighlighter.cpp new file mode 100644 index 0000000..9899cc4 --- /dev/null +++ b/plugins/haskell/haskellhighlighter.cpp @@ -0,0 +1,152 @@ +/**************************************************************************** +** +** Copyright (C) 2017 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of Qt Creator. +** +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3 as published by the Free Software +** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-3.0.html. +** +****************************************************************************/ + +#include "haskellhighlighter.h" + +#include "haskelltokenizer.h" + +#include <texteditor/fontsettings.h> +#include <texteditor/texteditorconstants.h> +#include <texteditor/texteditorsettings.h> + +#include <QDebug> +#include <QVector> + +Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, IMPORT_HIGHLIGHTS, ({ + "qualified", + "as", + "hiding" +})); + +using namespace TextEditor; + +namespace Haskell { +namespace Internal { + +HaskellHighlighter::HaskellHighlighter() +{ + setDefaultTextFormatCategories(); + updateFormats(TextEditorSettings::fontSettings()); +} + +void HaskellHighlighter::highlightBlock(const QString &text) +{ + const Tokens tokens = HaskellTokenizer::tokenize(text, previousBlockState()); + setCurrentBlockState(tokens.state); + const Token *firstNonWS = 0; + const Token *secondNonWS = 0; + bool inType = false; + bool inImport = false; + for (const Token & token : tokens) { + switch (token.type) { + case TokenType::Variable: + if (inType) + setTokenFormat(token, C_LOCAL); + else if (inImport && IMPORT_HIGHLIGHTS->contains(token.text.toString())) + setTokenFormat(token, C_KEYWORD); +// else +// setTokenFormat(token, C_TEXT); + break; + case TokenType::Constructor: + case TokenType::OperatorConstructor: + setTokenFormat(token, C_TYPE); + break; + case TokenType::Operator: + setTokenFormat(token, C_OPERATOR); + break; + case TokenType::Whitespace: + setTokenFormat(token, C_VISUAL_WHITESPACE); + break; + case TokenType::Keyword: + if (token.text == "::" && firstNonWS && !secondNonWS) { // toplevel declaration + setFormat(firstNonWS->startCol, firstNonWS->length, m_toplevelDeclFormat); + inType = true; + } else if (token.text == "import") { + inImport = true; + } + setTokenFormat(token, C_KEYWORD); + break; + case TokenType::Integer: + case TokenType::Float: + setTokenFormat(token, C_NUMBER); + break; + case TokenType::String: + setTokenFormatWithSpaces(text, token, C_STRING); + break; + case TokenType::Char: + setTokenFormatWithSpaces(text, token, C_STRING); + break; + case TokenType::EscapeSequence: + setTokenFormat(token, C_PRIMITIVE_TYPE); + break; + case TokenType::SingleLineComment: + setTokenFormatWithSpaces(text, token, C_COMMENT); + break; + case TokenType::MultiLineComment: + setTokenFormatWithSpaces(text, token, C_COMMENT); + break; + case TokenType::Special: +// setTokenFormat(token, C_TEXT); + break; + case TokenType::StringError: + case TokenType::CharError: + case TokenType::Unknown: + setTokenFormat(token, C_PARENTHESES_MISMATCH); + break; + } + if (token.type != TokenType::Whitespace) { + if (!firstNonWS) + firstNonWS = &token; + else if (!secondNonWS) + secondNonWS = &token; + } + } +} + +void HaskellHighlighter::setFontSettings(const FontSettings &fontSettings) +{ + SyntaxHighlighter::setFontSettings(fontSettings); + updateFormats(fontSettings); +} + +void HaskellHighlighter::updateFormats(const FontSettings &fontSettings) +{ + m_toplevelDeclFormat = fontSettings.toTextCharFormat( + TextStyles::mixinStyle(C_FUNCTION, C_DECLARATION)); +} + +void HaskellHighlighter::setTokenFormat(const Token &token, TextStyle style) +{ + setFormat(token.startCol, token.length, formatForCategory(style)); +} + +void HaskellHighlighter::setTokenFormatWithSpaces(const QString &text, const Token &token, + TextStyle style) +{ + setFormatWithSpaces(text, token.startCol, token.length, formatForCategory(style)); +} + +} // Internal +} // Haskell diff --git a/plugins/haskell/haskellhighlighter.h b/plugins/haskell/haskellhighlighter.h new file mode 100644 index 0000000..6213333 --- /dev/null +++ b/plugins/haskell/haskellhighlighter.h @@ -0,0 +1,58 @@ +/**************************************************************************** +** +** Copyright (C) 2017 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of Qt Creator. +** +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3 as published by the Free Software +** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-3.0.html. +** +****************************************************************************/ + +#pragma once + +#include <texteditor/syntaxhighlighter.h> + +#include <QHash> +#include <QTextFormat> + +namespace Haskell { +namespace Internal { + +class Token; + +class HaskellHighlighter : public TextEditor::SyntaxHighlighter +{ + Q_OBJECT + +public: + HaskellHighlighter(); + +protected: + void highlightBlock(const QString &text) override; + +private: + void setFontSettings(const TextEditor::FontSettings &fontSettings) override; + void updateFormats(const TextEditor::FontSettings &fontSettings); + void setTokenFormat(const Token &token, TextEditor::TextStyle style); + void setTokenFormatWithSpaces(const QString &text, const Token &token, + TextEditor::TextStyle style); + QTextCharFormat m_toplevelDeclFormat; +}; + +} // Internal +} // Haskell diff --git a/plugins/haskell/haskelltokenizer.cpp b/plugins/haskell/haskelltokenizer.cpp new file mode 100644 index 0000000..527e505 --- /dev/null +++ b/plugins/haskell/haskelltokenizer.cpp @@ -0,0 +1,631 @@ +/**************************************************************************** +** +** Copyright (C) 2017 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of Qt Creator. +** +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3 as published by the Free Software +** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-3.0.html. +** +****************************************************************************/ + +#include "haskelltokenizer.h" + +#include <QSet> + +#include <algorithm> +#include <functional> + +Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_OP, ({ + "..", + ":", + "::", + "=", + "\\", + "|", + "<-", + "->", + "@", + "~", + "=>", + + // Arrows GHC extension + "-<", + "-<<", + ">-", + ">>-", + "(|", + "|)" +})); + +Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_ID, ({ + "case", + "class", + "data", + "default", + "deriving", + "do", + "else", + "foreign", + "if", + "import", + "in", + "infix", + "infixl", + "infixr", + "instance", + "let", + "module", + "newtype", + "of", + "then", + "type", + "where", + "_", + + // from GHC extensions + "family", + "forall", + "mdo", + "proc", + "rec" +})); + +Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, SPECIAL, ({ + '(', + ')', + ',', + ';', + '[', + ']', + '`', + '{', + '}', +})); + +Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, CHAR_ESCAPES, + ({'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'})); + +Q_GLOBAL_STATIC_WITH_ARGS(QVector<QString>, ASCII_ESCAPES, ({ + "NUL", + "SOH", // must be before "SO" to match + "STX", + "ETX", + "EOT", + "ENQ", + "ACK", + "BEL", + "BS", + "HT", + "LF", + "VT", + "FF", + "CR", + "SO", + "SI", + "DLE", + "DC1", + "DC2", + "DC3", + "DC4", + "NAK", + "SYN", + "ETB", + "CAN", + "EM", + "SUB", + "ESC", + "FS", + "GS", + "RS", + "US", + "SP", + "DEL" +})); + +namespace Haskell { +namespace Internal { + +Token token(TokenType type, std::shared_ptr<QString> line, int start, int end) +{ + return {type, start, end - start, line->midRef(start, end - start), line}; +} + +Tokens::Tokens(std::shared_ptr<QString> source) + : source(source) +{ +} + +static int grab(const QString &line, int begin, + const std::function<bool(const QChar&)> &test) +{ + const int length = line.length(); + int current = begin; + while (current < length && test(line.at(current))) + ++current; + return current - begin; +}; + + +static bool isIdentifierChar(const QChar &c) +{ + return c.isLetterOrNumber() || c == '\'' || c == '_'; +} + +static bool isVariableIdentifierStart(const QChar &c) +{ + return c == '_' || c.isLower(); +} + +static bool isAscSymbol(const QChar &c) +{ + return c == '!' + || c == '#' + || c == '$' + || c == '%' + || c == '&' + || c == '*' + || c == '+' + || c == '.' + || c == '/' + || c == '<' + || c == '=' + || c == '>' + || c == '?' + || c == '@' + || c == '\\' + || c == '^' + || c == '|' + || c == '-' + || c == '~' + || c == ':'; +} + +static bool isSymbol(const QChar &c) +{ + return isAscSymbol(c) + || ((c.isSymbol() || c.isPunct()) && c != '_' && c != '"' && c != '\'' + && !SPECIAL->contains(c)); +} + +static bool isDigit(const QChar &c) +{ + return c.isDigit(); +} + +static bool isOctit(const QChar &c) +{ + return c >= '0' && c <= '7'; +} + +static bool isHexit(const QChar &c) +{ + return c.isDigit() + || (c >= 'A' && c <= 'F') + || (c >= 'a' && c <= 'f'); +} + +static bool isCntrl(const QChar &c) +{ + return (c >= 'A' && c <= 'Z') + || c == '@' + || c == '[' + || c == '\\' + || c == ']' + || c == '^' + || c == '_'; +} + +static QVector<Token> getSpace(std::shared_ptr<QString> line, int start) +{ + const auto lineEnd = line->cend(); + const auto tokenStart = line->cbegin() + start; + auto current = tokenStart; + while (current != lineEnd && (*current).isSpace()) + ++current; + const int length = int(std::distance(tokenStart, current)); + if (current > tokenStart) + return {{TokenType::Whitespace, start, length, line->midRef(start, length), line}}; + return {}; +} + +static QVector<Token> getNumber(std::shared_ptr<QString> line, int start) +{ + const QChar &startC = line->at(start); + if (!startC.isDigit()) + return {}; + const int length = line->length(); + int current = start + 1; + TokenType type = TokenType::Integer; + if (current < length) { + if (startC == '0') { + // check for octal or hexadecimal + const QChar &secondC = line->at(current); + if (secondC == 'o' || secondC == 'O') { + const int numLen = grab(*line, current + 1, isOctit); + if (numLen > 0) + return {token(TokenType::Integer, line, start, current + numLen + 1)}; + } else if (secondC == 'x' || secondC == 'X') { + const int numLen = grab(*line, current + 1, isHexit); + if (numLen > 0) + return {token(TokenType::Integer, line, start, current + numLen + 1)}; + } + } + // starts with decimal + const int numLen = grab(*line, start, isDigit); + current = start + numLen; + // check for floating point + if (current < length && line->at(current) == '.') { + const int numLen = grab(*line, current + 1, isDigit); + if (numLen > 0) { + current += numLen + 1; + type = TokenType::Float; + } + } + // check for exponent + if (current + 1 < length /*for at least 'e' and digit*/ + && (line->at(current) == 'e' || line->at(current) == 'E')) { + int expEnd = current + 1; + if (line->at(expEnd) == '+' || line->at(expEnd) == '-') + ++expEnd; + const int numLen = grab(*line, expEnd, isDigit); + if (numLen > 0) { + current = expEnd + numLen; + type = TokenType::Float; + } + } + } + return {token(type, line, start, current)}; +} + +static QVector<Token> getIdOrOpOrSingleLineComment(std::shared_ptr<QString> line, int start) +{ + const int length = line->length(); + if (start >= length) + return {}; + int current = start; + // check for {conid.}conid + int conidEnd = start; + bool canOnlyBeConstructor = false; + while (current < length && line->at(current).isUpper()) { + current += grab(*line, current, isIdentifierChar); + conidEnd = current; + // it is definitely a constructor id if it is not followed by a '.' + canOnlyBeConstructor = current >= length || line->at(current) != '.'; + // otherwise it might be a module id, and we skip the dot to check for qualified thing + if (!canOnlyBeConstructor) + ++current; + } + if (canOnlyBeConstructor) + return {token(TokenType::Constructor, line, start, conidEnd)}; + + // check for variable or reserved id + if (current < length && isVariableIdentifierStart(line->at(current))) { + const int varLen = grab(*line, current, isIdentifierChar); + // check for reserved id + if (RESERVED_ID->contains(line->mid(current, varLen))) { + QVector<Token> result; + // possibly add constructor + op '.' + if (conidEnd > start) { + result.append(token(TokenType::Constructor, line, start, conidEnd)); + result.append(token(TokenType::Operator, line, conidEnd, current)); + } + result.append(token(TokenType::Keyword, line, current, current + varLen)); + return result; + } + return {token(TokenType::Variable, line, start, current + varLen)}; + } + // check for operator + if (current < length && isSymbol(line->at(current))) { + const int opLen = grab(*line, current, isSymbol); + // check for reserved op + if (RESERVED_OP->contains(line->mid(current, opLen))) { + // because of the case of F... (constructor + op '...') etc + // we only add conid if we have one, handling the rest in next iteration + if (conidEnd > start) + return {token(TokenType::Constructor, line, start, conidEnd)}; + return {token(TokenType::Keyword, line, start, current + opLen)}; + } + // check for single line comment + if (opLen >= 2 && std::all_of(line->begin() + current, line->begin() + current + opLen, + [](const QChar c) { return c == '-'; })) { + QVector<Token> result; + // possibly add constructor + op '.' + if (conidEnd > start) { + result.append(token(TokenType::Constructor, line, start, conidEnd)); + result.append(token(TokenType::Operator, line, conidEnd, current)); + } + // rest is comment + result.append(token(TokenType::SingleLineComment, line, current, length)); + return result; + } + // check for (qualified?) operator constructor + if (line->at(current) == ':') + return {token(TokenType::OperatorConstructor, line, start, current + opLen)}; + return {token(TokenType::Operator, line, start, current + opLen)}; + } + // Foo.Blah. + if (conidEnd > start) + return {token(TokenType::Constructor, line, start, conidEnd)}; + return {}; +} + +static int getEscape(const QString &line, int start) +{ + if (CHAR_ESCAPES->contains(line.at(start))) + return 1; + + // decimal + if (line.at(start).isDigit()) + return grab(line, start + 1, isDigit) + 1; + // octal + if (line.at(start) == 'o') { + const int count = grab(line, start + 1, isOctit); + if (count < 1) // no octal number after 'o' + return 0; + return count + 1; + } + // hexadecimal + if (line.at(start) == 'x') { + const int count = grab(line, start + 1, isHexit); + if (count < 1) // no octal number after 'o' + return 0; + return count + 1; + } + // ascii cntrl + if (line.at(start) == '^') { + const int count = grab(line, start + 1, isCntrl); + if (count < 1) // no octal number after 'o' + return 0; + return count + 1; + } + const QStringRef s = line.midRef(start); + for (const QString &esc : *ASCII_ESCAPES) { + if (s.startsWith(esc)) + return esc.length(); + } + return 0; +} + +static QVector<Token> getString(std::shared_ptr<QString> line, int start, bool *inStringGap/*in-out*/) +{ + // Haskell has the specialty of using \<whitespace>\ within strings for multiline strings + const int length = line->length(); + if (start >= length) + return {}; + QVector<Token> result; + int tokenStart = start; + int current = tokenStart; + bool inString = *inStringGap; + do { + const QChar c = line->at(current); + if (*inStringGap && !c.isSpace() && c != '\\') { + // invalid non-whitespace in string gap + // add previous string as token, this is at least a whitespace + result.append(token(TokenType::String, line, tokenStart, current)); + // then add wrong non-whitespace + tokenStart = current; + do { ++current; } while (current < length && !line->at(current).isSpace()); + result.append(token(TokenType::StringError, line, tokenStart, current)); + tokenStart = current; + } else if (c == '"') { + inString = !inString; + ++current; + } else if (inString) { + if (c == '\\') { + ++current; + if (*inStringGap) { + // ending string gap + *inStringGap = false; + } else if (current >= length || line->at(current).isSpace()) { + // starting string gap + *inStringGap = true; + current = std::min(current + 1, length); + } else { // there is at least one character after current + const int escapeLength = getEscape(*line, current); + if (escapeLength > 0) { + // valid escape + // add previous string as token without backslash, if necessary + if (tokenStart < current - 1/*backslash*/) + result.append(token(TokenType::String, line, tokenStart, current - 1)); + tokenStart = current - 1; // backslash + current += escapeLength; + result.append(token(TokenType::EscapeSequence, line, tokenStart, current)); + tokenStart = current; + } else { // invalid escape sequence + // add previous string as token, this is at least backslash + result.append(token(TokenType::String, line, tokenStart, current)); + result.append(token(TokenType::StringError, line, current, current + 1)); + ++current; + tokenStart = current; + } + } + } else { + ++current; + } + } + } while (current < length && inString); + if (current > tokenStart) + result.append(token(TokenType::String, line, tokenStart, current)); + if (inString && !*inStringGap) { // unterminated string + // mark last character of last token as Unknown as an error hint + if (!result.isEmpty()) { // should actually never be different + Token &lastRef = result.last(); + if (lastRef.length == 1) { + lastRef.type = TokenType::StringError; + } else { + --lastRef.length; + lastRef.text = line->midRef(lastRef.startCol, lastRef.length); + result.append(token(TokenType::StringError, line, current - 1, current)); + } + } + } + return result; +} + +static QVector<Token> getMultiLineComment(std::shared_ptr<QString> line, int start, + int *commentLevel/*in_out*/) +{ + // Haskell multiline comments can be nested {- foo {- bar -} blah -} + const int length = line->length(); + int current = start; + do { + const QStringRef test = line->midRef(current, 2); + if (test == "{-") { + ++(*commentLevel); + current += 2; + } else if (test == "-}" && *commentLevel > 0) { + --(*commentLevel); + current += 2; + } else if (*commentLevel > 0) { + ++current; + } + } while (current < length && *commentLevel > 0); + if (current > start) { + return {token(TokenType::MultiLineComment, line, start, current)}; + } + return {}; +} + +static QVector<Token> getChar(std::shared_ptr<QString> line, int start) +{ + if (line->at(start) != '\'') + return {}; + QVector<Token> result; + const int length = line->length(); + int tokenStart = start; + int current = tokenStart + 1; + bool inChar = true; + int count = 0; + while (current < length && inChar) { + if (line->at(current) == '\'') { + inChar = false; + ++current; + } else if (count == 1) { + // we already have one character, so start Unknown token + if (current > tokenStart) + result.append(token(TokenType::Char, line, tokenStart, current)); + tokenStart = current; + ++count; + ++current; + } else if (count > 1) { + ++count; + ++current; + } else if (line->at(current) == '\\') { + if (current + 1 < length) { + ++current; + ++count; + const int escapeLength = getEscape(*line, current); + if (line->at(current) != '&' && escapeLength > 0) { // no & escape for chars + // valid escape + // add previous string as token without backslash, if necessary + if (tokenStart < current - 1/*backslash*/) + result.append(token(TokenType::Char, line, tokenStart, current - 1)); + tokenStart = current - 1; // backslash + current += escapeLength; + result.append(token(TokenType::EscapeSequence, line, tokenStart, current)); + tokenStart = current; + } else { // invalid escape sequence + // add previous string as token, this is at least backslash + result.append(token(TokenType::Char, line, tokenStart, current)); + result.append(token(TokenType::CharError, line, current, current + 1)); + ++current; + tokenStart = current; + } + } else { + ++current; + } + } else { + ++count; + ++current; + } + } + if (count > 1 && inChar) { + // too long and unterminated, just add Unknown token till end + result.append(token(TokenType::CharError, line, tokenStart, current)); + } else if (count > 1) { + // too long but terminated, add Unknown up to ending quote, then quote + result.append(token(TokenType::CharError, line, tokenStart, current - 1)); + result.append(token(TokenType::Char, line, current - 1, current)); + } else if (inChar || count < 1) { + // unterminated, or no character inside, mark last character as error + if (current > tokenStart + 1) + result.append(token(TokenType::Char, line, tokenStart, current - 1)); + result.append(token(TokenType::CharError, line, current - 1, current)); + } else { + result.append(token(TokenType::Char, line, tokenStart, current)); + } + return result; +} + +static QVector<Token> getSpecial(std::shared_ptr<QString> line, int start) +{ + if (SPECIAL->contains(line->at(start))) + return {{TokenType::Special, start, 1, line->midRef(start, 1), line}}; + return {}; +} + +Tokens HaskellTokenizer::tokenize(const QString &line, int startState) +{ + Tokens result(std::make_shared<QString>(line)); + const int length = result.source->length(); + bool inStringGap = startState == int(Tokens::State::StringGap); + int multiLineCommentLevel = std::max(startState - int(Tokens::State::MultiLineCommentGuard), 0); + int currentStart = 0; + QVector<Token> tokens; + while (currentStart < length) { + if (multiLineCommentLevel <= 0 && + !(tokens = getString(result.source, currentStart, &inStringGap)).isEmpty()) { + result.append(tokens); + } else if (!(tokens = getMultiLineComment(result.source, currentStart, + &multiLineCommentLevel)).isEmpty()) { + result.append(tokens); + } else if (!(tokens = getChar(result.source, currentStart)).isEmpty()) { + result.append(tokens); + } else if (!(tokens = getSpace(result.source, currentStart)).isEmpty()) { + result.append(tokens); + } else if (!(tokens = getNumber(result.source, currentStart)).isEmpty()) { + result.append(tokens); + } else if (!(tokens = getIdOrOpOrSingleLineComment(result.source, currentStart)).isEmpty()) { + result.append(tokens); + } else if (!(tokens = getSpecial(result.source, currentStart)).isEmpty()) { + result.append(tokens); + } else { + tokens = {{TokenType::Unknown, + currentStart, + 1, + result.source->midRef(currentStart, 1), + result.source}}; + result.append(tokens); + } + currentStart += std::accumulate(tokens.cbegin(), tokens.cend(), 0, + [](int s, const Token &t) { return s + t.length; }); + } + if (inStringGap) + result.state = int(Tokens::State::StringGap); + else if (multiLineCommentLevel > 0) + result.state = int(Tokens::State::MultiLineCommentGuard) + multiLineCommentLevel; + return result; +} + +bool Token::isValid() const +{ + return type != TokenType::Unknown; +} + +} // Internal +} // Haskell diff --git a/plugins/haskell/haskelltokenizer.h b/plugins/haskell/haskelltokenizer.h new file mode 100644 index 0000000..46b4b00 --- /dev/null +++ b/plugins/haskell/haskelltokenizer.h @@ -0,0 +1,91 @@ +/**************************************************************************** +** +** Copyright (C) 2017 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of Qt Creator. +** +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3 as published by the Free Software +** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-3.0.html. +** +****************************************************************************/ + +#pragma once + +#include <QChar> +#include <QString> +#include <QVector> + +#include <memory> + +namespace Haskell { +namespace Internal { + +enum class TokenType { + Variable, + Constructor, + Operator, + OperatorConstructor, + Whitespace, + String, + StringError, + Char, + CharError, + EscapeSequence, + Integer, + Float, + Keyword, + Special, + SingleLineComment, + MultiLineComment, + Unknown +}; + +class Token { +public: + bool isValid() const; + + TokenType type = TokenType::Unknown; + int startCol = -1; + int length = -1; + QStringRef text; + + std::shared_ptr<QString> source; // keep the string ref alive +}; + +class Tokens : public QVector<Token> +{ +public: + enum class State { + None = -1, + StringGap = 0, // gap == two backslashes enclosing only whitespace + MultiLineCommentGuard // nothing may follow that + }; + + Tokens(std::shared_ptr<QString> source); + + std::shared_ptr<QString> source; + int state = int(State::None); +}; + +class HaskellTokenizer +{ +public: + static Tokens tokenize(const QString &line, int startState); +}; + +} // Internal +} // Haskell |