From 70122b3061ee3fbb07442beb0158edf849ceb98e Mon Sep 17 00:00:00 2001 From: Nikolai Kosjar Date: Tue, 25 Feb 2014 13:44:11 -0300 Subject: C++: Support for UTF-8 in the lexer This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will become handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. 
Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen --- src/libs/3rdparty/cplusplus/Lexer.h | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'src/libs/3rdparty/cplusplus/Lexer.h') diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h index 43a877e7a84..8d63d2ba1db 100644 --- a/src/libs/3rdparty/cplusplus/Lexer.h +++ b/src/libs/3rdparty/cplusplus/Lexer.h @@ -62,6 +62,7 @@ public: void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; } private: + void pushLineStartOffset(); void scan_helper(Token *tok); void setSource(const char *firstChar, const char *lastChar); static int classify(const char *string, int length, LanguageFeatures features); @@ -77,15 +78,32 @@ private: void scanBackslash(Kind type); void scanCppComment(Kind type); - inline void yyinp() + static bool isByteOfMultiByteCodePoint(unsigned char byte) + { return byte & 0x80; } // Check if most significant bit is set + + void yyinp() { - _yychar = *++_currentChar; + ++_currentCharUtf16; + + // Process multi-byte UTF-8 code point (non-latin1) + if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) { + unsigned trailingBytesCurrentCodePoint = 1; + for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1) + ++trailingBytesCurrentCodePoint; + // Code points >= 0x00010000 are represented by two UTF16 code units + if (trailingBytesCurrentCodePoint >= 3) + ++_currentCharUtf16; + _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1); + + // Process single-byte UTF-8 code point (latin1) + } else { + _yychar = *++_currentChar; + } + if (CPLUSPLUS_UNLIKELY(_yychar == '\n')) pushLineStartOffset(); } - void pushLineStartOffset(); - private: struct Flags { unsigned _scanCommentTokens: 1; @@ -105,6 +123,10 @@ private: const char *_lastChar; const char *_tokenStart; unsigned char _yychar; + + unsigned _currentCharUtf16; + unsigned _tokenStartUtf16; 
+ union { unsigned char _state; State s; @@ -113,6 +135,7 @@ private: unsigned _flags; Flags f; }; + unsigned _currentLine; LanguageFeatures _languageFeatures; }; -- cgit v1.2.3