/****************************************************************************
**
** Copyright (C) 2017 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of Qt Creator.
**
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3 as published by the Free Software
** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-3.0.html.
**
****************************************************************************/

#include "haskelltokenizer.h"

#include <QSet>

#include <algorithm>
#include <functional>
#include <numeric>

// Operators that are reported as keywords instead of ordinary operators.
Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_OP, ({
    "..",
    ":",
    "::",
    "=",
    "\\",
    "|",
    "<-",
    "->",
    "@",
    "~",
    "=>",

    // Arrows GHC extension
    "-<",
    "-<<",
    ">-",
    ">>-",
    "(|",
    "|)"
}));

// Identifiers that are reported as keywords instead of variables.
Q_GLOBAL_STATIC_WITH_ARGS(QSet<QString>, RESERVED_ID, ({
    "case",
    "class",
    "data",
    "default",
    "deriving",
    "do",
    "else",
    "foreign",
    "if",
    "import",
    "in",
    "infix",
    "infixl",
    "infixr",
    "instance",
    "let",
    "module",
    "newtype",
    "of",
    "then",
    "type",
    "where",
    "_",

    // from GHC extensions
    "family",
    "forall",
    "mdo",
    "proc",
    "rec"
}));

// Single characters that form a token of type Special on their own.
Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, SPECIAL, ({
    '(',
    ')',
    ',',
    ';',
    '[',
    ']',
    '`',
    '{',
    '}'
}));

// Characters that form a one-character escape when they follow a backslash.
Q_GLOBAL_STATIC_WITH_ARGS(QSet<QChar>, CHAR_ESCAPES, ({
    'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '&'
}));

// Named ASCII escapes. getEscape() returns the first entry that matches as a
// prefix, so the order of entries that share a prefix matters.
Q_GLOBAL_STATIC_WITH_ARGS(QVector<QString>, ASCII_ESCAPES, ({
    "NUL",
    "SOH", // must be before "SO" to match
    "STX",
    "ETX",
    "EOT",
    "ENQ",
    "ACK",
    "BEL",
    "BS",
    "HT",
    "LF",
    "VT",
    "FF",
    "CR",
    "SO",
    "SI",
    "DLE",
    "DC1",
    "DC2",
    "DC3",
    "DC4",
    "NAK",
    "SYN",
    "ETB",
    "CAN",
    "EM",
    "SUB",
    "ESC",
    "FS",
    "GS",
    "RS",
    "US",
    "SP",
    "DEL"
}));

namespace Haskell {
namespace Internal {

/*
    Creates a token of the given type that covers the half-open column range
    [start, end) of line. The token keeps a reference into line as its text and
    shares ownership of line.
*/
Token token(TokenType type, std::shared_ptr<QString> line, int start, int end)
{
    return {type, start, end - start, line->midRef(start, end - start), line};
}

// Creates an empty token list that refers to the given source line.
Tokens::Tokens(std::shared_ptr<QString> source)
    : source(source)
{
}

/*
    Returns the token whose column range contains col, or a default constructed
    Token if there is none. The lookup is a binary search on startCol, so it
    relies on the tokens being ordered by their start column.
*/
Token Tokens::tokenAtColumn(int col) const
{
    // first token that starts after col; the candidate is the one before it
    auto it = std::upper_bound(begin(), end(), col, [](int c, const Token &i) {
        return c < i.startCol;
    });
    if (it == begin())
        return Token();
    --it;
    if (it->startCol + it->length > col)
        return *it;
    return Token();
}

/*
    Returns the number of consecutive characters of line, starting at index
    begin, for which test returns true. Returns 0 if begin is at or behind the
    end of line.
*/
static int grab(const QString &line, int begin, const std::function<bool(const QChar &)> &test)
{
    const int length = line.length();
    int current = begin;
    while (current < length && test(line.at(current)))
        ++current;
    return current - begin;
}

// Characters that may appear in an identifier after its first character.
static bool isIdentifierChar(const QChar &c)
{
    return c.isLetterOrNumber() || c == '\'' || c == '_';
}

// Characters that may start a variable identifier.
static bool isVariableIdentifierStart(const QChar &c)
{
    return c == '_' || c.isLower();
}

// The ASCII characters that can be part of an operator.
static bool isAscSymbol(const QChar &c)
{
    return c == '!'
            || c == '#'
            || c == '$'
            || c == '%'
            || c == '&'
            || c == '*'
            || c == '+'
            || c == '.'
            || c == '/'
            || c == '<'
            || c == '='
            || c == '>'
            || c == '?'
            || c == '@'
            || c == '\\'
            || c == '^'
            || c == '|'
            || c == '-'
            || c == '~'
            || c == ':';
}

/*
    Characters that can be part of an operator: the ASCII symbols plus any
    symbol or punctuation character that is not '_', a quote, or a SPECIAL.
*/
static bool isSymbol(const QChar &c)
{
    return isAscSymbol(c)
            || ((c.isSymbol() || c.isPunct()) && c != '_' && c != '"' && c != '\''
                && !SPECIAL->contains(c));
}

static bool isDigit(const QChar &c)
{
    return c.isDigit();
}

static bool isOctit(const QChar &c)
{
    return c >= '0' && c <= '7';
}

static bool isHexit(const QChar &c)
{
    return c.isDigit()
            || (c >= 'A' && c <= 'F')
            || (c >= 'a' && c <= 'f');
}

// Characters that may follow '^' in a control character escape like \^A.
static bool isCntrl(const QChar &c)
{
    return (c >= 'A' && c <= 'Z')
            || c == '@'
            || c == '['
            || c == '\\'
            || c == ']'
            || c == '^'
            || c == '_';
}

/*
    Returns a single Whitespace token for the run of whitespace that starts at
    index start, or an empty list if the character at start is not whitespace.
*/
static QVector<Token> getSpace(std::shared_ptr<QString> line, int start)
{
    const auto lineEnd = line->cend();
    const auto tokenStart = line->cbegin() + start;
    auto current = tokenStart;
    while (current != lineEnd && (*current).isSpace())
        ++current;
    const int length = int(std::distance(tokenStart, current));
    if (current > tokenStart)
        return {{TokenType::Whitespace, start, length, line->midRef(start, length), line}};
    return {};
}

/*
    Returns a single Integer or Float token for the numeric literal that starts
    at index start, or an empty list if the character at start is not a digit.
    The caller must ensure that start is a valid index into line.

    Recognized are 0o/0O octal and 0x/0X hexadecimal integers, decimal
    integers, and decimals with a fraction and/or an exponent.
*/
static QVector<Token> getNumber(std::shared_ptr<QString> line, int start)
{
    const QChar &startC = line->at(start);
    if (!startC.isDigit())
        return {};
    const int length = line->length();
    int current = start + 1;
    TokenType type = TokenType::Integer;
    if (current < length) {
        if (startC == '0') {
            // check for octal or hexadecimal
            // a prefix without any digit behind it falls through to decimal handling
            const QChar &secondC = line->at(current);
            if (secondC == 'o' || secondC == 'O') {
                const int numLen = grab(*line, current + 1, isOctit);
                if (numLen > 0)
                    return {token(TokenType::Integer, line, start, current + numLen + 1)};
            } else if (secondC == 'x' || secondC == 'X') {
                const int numLen = grab(*line, current + 1, isHexit);
                if (numLen > 0)
                    return {token(TokenType::Integer, line, start, current + numLen + 1)};
            }
        }
        // starts with decimal
        const int decimalLen = grab(*line, start, isDigit);
        current = start + decimalLen;
        // check for floating point
        // the '.' only belongs to the number if at least one digit follows
        if (current < length && line->at(current) == '.') {
            const int fractionLen = grab(*line, current + 1, isDigit);
            if (fractionLen > 0) {
                current += fractionLen + 1;
                type = TokenType::Float;
            }
        }
        // check for exponent
        if (current + 1 < length /*for at least 'e' and digit*/
                && (line->at(current) == 'e' || line->at(current) == 'E')) {
            int expEnd = current + 1;
            if (line->at(expEnd) == '+' || line->at(expEnd) == '-')
                ++expEnd;
            const int exponentLen = grab(*line, expEnd, isDigit);
            if (exponentLen > 0) {
                current = expEnd + exponentLen;
                type = TokenType::Float;
            }
        }
    }
    return {token(type, line, start, current)};
}

/*
    Handles everything that starts with an identifier or symbol character at
    index start: (qualified) constructors, variables, keywords, operators,
    operator constructors and single line comments.

    Returns an empty list if nothing matches. May return more than one token,
    for example constructor + '.' + keyword, and may return only the leading
    constructor part and leave the rest for the next call.
*/
static QVector<Token> getIdOrOpOrSingleLineComment(std::shared_ptr<QString> line, int start)
{
    const int length = line->length();
    if (start >= length)
        return {};
    int current = start;
    // check for {conid.}conid
    int conidEnd = start;
    bool canOnlyBeConstructor = false;
    while (current < length && line->at(current).isUpper()) {
        current += grab(*line, current, isIdentifierChar);
        conidEnd = current;
        // it is definitely a constructor id if it is not followed by a '.'
        canOnlyBeConstructor = current >= length || line->at(current) != '.';
        // otherwise it might be a module id, and we skip the dot to check for qualified thing
        if (!canOnlyBeConstructor)
            ++current;
    }
    if (canOnlyBeConstructor)
        return {token(TokenType::Constructor, line, start, conidEnd)};

    // check for variable or reserved id
    if (current < length && isVariableIdentifierStart(line->at(current))) {
        const int varLen = grab(*line, current, isIdentifierChar);
        // check for reserved id
        if (RESERVED_ID->contains(line->mid(current, varLen))) {
            QVector<Token> result;
            // possibly add constructor + op '.'
            if (conidEnd > start) {
                result.append(token(TokenType::Constructor, line, start, conidEnd));
                result.append(token(TokenType::Operator, line, conidEnd, current));
            }
            result.append(token(TokenType::Keyword, line, current, current + varLen));
            return result;
        }
        // the token includes the module qualification, if any
        return {token(TokenType::Variable, line, start, current + varLen)};
    }
    // check for operator
    if (current < length && isSymbol(line->at(current))) {
        const int opLen = grab(*line, current, isSymbol);
        // check for reserved op
        if (RESERVED_OP->contains(line->mid(current, opLen))) {
            // because of the case of F... (constructor + op '...') etc
            // we only add conid if we have one, handling the rest in next iteration
            if (conidEnd > start)
                return {token(TokenType::Constructor, line, start, conidEnd)};
            return {token(TokenType::Keyword, line, start, current + opLen)};
        }
        // check for single line comment
        // that is an operator made of two or more dashes and nothing else
        if (opLen >= 2 && std::all_of(line->begin() + current, line->begin() + current + opLen,
                                      [](const QChar c) { return c == '-'; })) {
            QVector<Token> result;
            // possibly add constructor + op '.'
            if (conidEnd > start) {
                result.append(token(TokenType::Constructor, line, start, conidEnd));
                result.append(token(TokenType::Operator, line, conidEnd, current));
            }
            // rest is comment
            result.append(token(TokenType::SingleLineComment, line, current, length));
            return result;
        }
        // check for (qualified?) operator constructor
        if (line->at(current) == ':')
            return {token(TokenType::OperatorConstructor, line, start, current + opLen)};
        return {token(TokenType::Operator, line, start, current + opLen)};
    }
    // Foo.Blah.
    if (conidEnd > start)
        return {token(TokenType::Constructor, line, start, conidEnd)};
    return {};
}

/*
    Returns the length of the escape sequence that starts at index start, which
    is the index of the character directly after the backslash. The backslash
    itself is not counted. Returns 0 if there is no valid escape sequence.
    The caller must ensure that start is a valid index into line.
*/
static int getEscape(const QString &line, int start)
{
    if (CHAR_ESCAPES->contains(line.at(start)))
        return 1;

    // decimal
    if (line.at(start).isDigit())
        return grab(line, start + 1, isDigit) + 1;
    // octal
    if (line.at(start) == 'o') {
        const int count = grab(line, start + 1, isOctit);
        if (count < 1) // no octal number after 'o'
            return 0;
        return count + 1;
    }
    // hexadecimal
    if (line.at(start) == 'x') {
        const int count = grab(line, start + 1, isHexit);
        if (count < 1) // no hexadecimal number after 'x'
            return 0;
        return count + 1;
    }
    // ascii cntrl
    if (line.at(start) == '^') {
        // '^' is followed by exactly one control character, anything after
        // that is regular content again (\^AB is the escape \^A followed by B)
        const int next = start + 1;
        if (next < line.length() && isCntrl(line.at(next)))
            return 2;
        return 0; // no control character after '^'
    }
    const QStringRef s = line.midRef(start);
    for (const QString &esc : *ASCII_ESCAPES) {
        if (s.startsWith(esc))
            return esc.length();
    }
    return 0;
}

/*
    Tokenizes a string literal that starts at index start, or continues a
    string from the previous line if *inStringGap is true on entry.

    Returns an empty list if there is no string at start. Otherwise the result
    consists of String, EscapeSequence and StringError tokens. On return
    *inStringGap is true if the line ended within a string gap, so the string
    continues on the next line.
*/
static QVector<Token> getString(std::shared_ptr<QString> line, int start, bool *inStringGap/*in-out*/)
{
    // Haskell has the specialty of using \<whitespace>\ within strings for multiline strings
    const int length = line->length();
    if (start >= length)
        return {};
    QVector<Token> result;
    int tokenStart = start;
    int current = tokenStart;
    bool inString = *inStringGap;
    do {
        const QChar c = line->at(current);
        if (*inStringGap && !c.isSpace() && c != '\\') {
            // invalid non-whitespace in string gap
            // add previous string as token, if there is any. There is none if the
            // invalid character is the first one on a continued line, and
            // tokens without content are not added.
            if (current > tokenStart)
                result.append(token(TokenType::String, line, tokenStart, current));
            // then add wrong non-whitespace
            tokenStart = current;
            do {
                ++current;
            } while (current < length && !line->at(current).isSpace());
            result.append(token(TokenType::StringError, line, tokenStart, current));
            tokenStart = current;
        } else if (c == '"') {
            inString = !inString;
            ++current;
        } else if (inString) {
            if (c == '\\') {
                ++current;
                if (*inStringGap) {
                    // ending string gap
                    *inStringGap = false;
                } else if (current >= length || line->at(current).isSpace()) {
                    // starting string gap
                    *inStringGap = true;
                    current = std::min(current + 1, length);
                } else { // there is at least one character after current
                    const int escapeLength = getEscape(*line, current);
                    if (escapeLength > 0) {
                        // valid escape
                        // add previous string as token without backslash, if necessary
                        if (tokenStart < current - 1/*backslash*/)
                            result.append(token(TokenType::String, line, tokenStart, current - 1));
                        tokenStart = current - 1; // backslash
                        current += escapeLength;
                        result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
                        tokenStart = current;
                    } else {
                        // invalid escape sequence
                        // add previous string as token, this is at least backslash
                        result.append(token(TokenType::String, line, tokenStart, current));
                        result.append(token(TokenType::StringError, line, current, current + 1));
                        ++current;
                        tokenStart = current;
                    }
                }
            } else {
                ++current;
            }
        }
    } while (current < length && inString);
    if (current > tokenStart)
        result.append(token(TokenType::String, line, tokenStart, current));
    if (inString && !*inStringGap) { // unterminated string
        // mark last character of last token as error hint
        if (!result.isEmpty()) { // should actually never be different
            Token &lastRef = result.last();
            if (lastRef.length == 1) {
                lastRef.type = TokenType::StringError;
            } else {
                --lastRef.length;
                lastRef.text = line->midRef(lastRef.startCol, lastRef.length);
                result.append(token(TokenType::StringError, line, current - 1, current));
            }
        }
    }
    return result;
}

/*
    Returns a single MultiLineComment token for the part of line, starting at
    index start, that is inside a multi-line comment. Returns an empty list if
    *commentLevel is 0 on entry and no comment starts at index start.

    *commentLevel is the nesting depth of the comment. It is updated while
    scanning and is greater than 0 on return if the comment continues on the
    next line.
*/
static QVector<Token> getMultiLineComment(std::shared_ptr<QString> line, int start,
                                          int *commentLevel/*in_out*/)
{
    // Haskell multiline comments can be nested {- foo {- bar -} blah -}
    const int length = line->length();
    int current = start;
    do {
        const QStringRef test = line->midRef(current, 2);
        if (test == "{-") {
            ++(*commentLevel);
            current += 2;
        } else if (test == "-}" && *commentLevel > 0) {
            --(*commentLevel);
            current += 2;
        } else if (*commentLevel > 0) {
            ++current;
        }
    } while (current < length && *commentLevel > 0);
    if (current > start) {
        return {token(TokenType::MultiLineComment, line, start, current)};
    }
    return {};
}

/*
    Tokenizes a character literal that starts at index start. Returns an empty
    list if the character at start is not a single quote. The caller must
    ensure that start is a valid index into line.

    The result consists of Char, EscapeSequence and CharError tokens.
    CharError marks invalid escapes, content after the first character,
    and empty or unterminated literals.
*/
static QVector<Token> getChar(std::shared_ptr<QString> line, int start)
{
    if (line->at(start) != '\'')
        return {};
    QVector<Token> result;
    const int length = line->length();
    int tokenStart = start;
    int current = tokenStart + 1;
    bool inChar = true;
    int count = 0; // number of characters (or escapes) seen inside the quotes
    while (current < length && inChar) {
        if (line->at(current) == '\'') {
            inChar = false;
            ++current;
        } else if (count == 1) {
            // we already have one character, so start error token
            if (current > tokenStart)
                result.append(token(TokenType::Char, line, tokenStart, current));
            tokenStart = current;
            ++count;
            ++current;
        } else if (count > 1) {
            ++count;
            ++current;
        } else if (line->at(current) == '\\') {
            if (current + 1 < length) {
                ++current;
                ++count;
                const int escapeLength = getEscape(*line, current);
                if (line->at(current) != '&' && escapeLength > 0) { // no & escape for chars
                    // valid escape
                    // add previous string as token without backslash, if necessary
                    if (tokenStart < current - 1/*backslash*/)
                        result.append(token(TokenType::Char, line, tokenStart, current - 1));
                    tokenStart = current - 1; // backslash
                    current += escapeLength;
                    result.append(token(TokenType::EscapeSequence, line, tokenStart, current));
                    tokenStart = current;
                } else {
                    // invalid escape sequence
                    // add previous string as token, this is at least backslash
                    result.append(token(TokenType::Char, line, tokenStart, current));
                    result.append(token(TokenType::CharError, line, current, current + 1));
                    ++current;
                    tokenStart = current;
                }
            } else {
                // backslash is the last character of the line
                ++current;
            }
        } else {
            ++count;
            ++current;
        }
    }
    if (count > 1 && inChar) {
        // too long and unterminated, just add error token till end
        result.append(token(TokenType::CharError, line, tokenStart, current));
    } else if (count > 1) {
        // too long but terminated, add error up to ending quote, then quote
        result.append(token(TokenType::CharError, line, tokenStart, current - 1));
        result.append(token(TokenType::Char, line, current - 1, current));
    } else if (inChar || count < 1) {
        // unterminated, or no character inside, mark last character as error
        if (current > tokenStart + 1)
            result.append(token(TokenType::Char, line, tokenStart, current - 1));
        result.append(token(TokenType::CharError, line, current - 1, current));
    } else {
        result.append(token(TokenType::Char, line, tokenStart, current));
    }
    return result;
}

/*
    Returns a single Special token if the character at index start is one of
    SPECIAL, otherwise an empty list. The caller must ensure that start is a
    valid index into line.
*/
static QVector<Token> getSpecial(std::shared_ptr<QString> line, int start)
{
    if (SPECIAL->contains(line->at(start)))
        return {{TokenType::Special, start, 1, line->midRef(start, 1), line}};
    return {};
}

/*
    Splits line into tokens.

    startState is the state that tokenizing the previous line ended with. It is
    either Tokens::State::StringGap, if that line ended within a string gap, or
    Tokens::State::MultiLineCommentGuard plus the nesting depth of an open
    multi-line comment. Any other value means that the line starts outside of
    strings and multi-line comments.

    The state member of the result is set to the corresponding value for the
    end of line if it ends within a string gap or multi-line comment, and is
    left at its initial value otherwise.
*/
Tokens HaskellTokenizer::tokenize(const QString &line, int startState)
{
    Tokens result(std::make_shared<QString>(line));
    const int length = result.source->length();
    bool inStringGap = startState == int(Tokens::State::StringGap);
    int multiLineCommentLevel = std::max(startState - int(Tokens::State::MultiLineCommentGuard), 0);
    int currentStart = 0;
    QVector<Token> tokens;
    // Try the sub-tokenizers in order of priority, the first one that returns
    // tokens wins. Strings are not looked for while inside a multi-line comment.
    while (currentStart < length) {
        if (multiLineCommentLevel <= 0 &&
                !(tokens = getString(result.source, currentStart, &inStringGap)).isEmpty()) {
            result.append(tokens);
        } else if (!(tokens = getMultiLineComment(result.source, currentStart,
                                                  &multiLineCommentLevel)).isEmpty()) {
            result.append(tokens);
        } else if (!(tokens = getChar(result.source, currentStart)).isEmpty()) {
            result.append(tokens);
        } else if (!(tokens = getSpace(result.source, currentStart)).isEmpty()) {
            result.append(tokens);
        } else if (!(tokens = getNumber(result.source, currentStart)).isEmpty()) {
            result.append(tokens);
        } else if (!(tokens = getIdOrOpOrSingleLineComment(result.source, currentStart)).isEmpty()) {
            result.append(tokens);
        } else if (!(tokens = getSpecial(result.source, currentStart)).isEmpty()) {
            result.append(tokens);
        } else {
            // nothing matched, consume a single character as Unknown
            tokens = {{TokenType::Unknown,
                       currentStart,
                       1,
                       result.source->midRef(currentStart, 1),
                       result.source}};
            result.append(tokens);
        }
        // continue behind the tokens that were just added
        currentStart += std::accumulate(tokens.cbegin(), tokens.cend(), 0,
                                        [](int s, const Token &t) { return s + t.length; });
    }
    if (inStringGap)
        result.state = int(Tokens::State::StringGap);
    else if (multiLineCommentLevel > 0)
        result.state = int(Tokens::State::MultiLineCommentGuard) + multiLineCommentLevel;
    return result;
}

// A token is valid if it has any type other than Unknown.
bool Token::isValid() const
{
    return type != TokenType::Unknown;
}

} // Internal
} // Haskell