aboutsummaryrefslogtreecommitdiffstats
path: root/src/libs/3rdparty/cplusplus/Lexer.h
diff options
context:
space:
mode:
authorNikolai Kosjar <nikolai.kosjar@digia.com>2014-02-25 13:44:11 -0300
committerNikolai Kosjar <nikolai.kosjar@digia.com>2014-05-23 14:23:15 +0200
commit70122b3061ee3fbb07442beb0158edf849ceb98e (patch)
treee8c272ec1df948acd27378a44764dd683ab5b426 /src/libs/3rdparty/cplusplus/Lexer.h
parent4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff)
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will become handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
Diffstat (limited to 'src/libs/3rdparty/cplusplus/Lexer.h')
-rw-r--r--src/libs/3rdparty/cplusplus/Lexer.h31
1 files changed, 27 insertions, 4 deletions
diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h
index 43a877e7a8..8d63d2ba1d 100644
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -62,6 +62,7 @@ public:
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
private:
+ void pushLineStartOffset();
void scan_helper(Token *tok);
void setSource(const char *firstChar, const char *lastChar);
static int classify(const char *string, int length, LanguageFeatures features);
@@ -77,15 +78,32 @@ private:
void scanBackslash(Kind type);
void scanCppComment(Kind type);
- inline void yyinp()
+ static bool isByteOfMultiByteCodePoint(unsigned char byte)
+ { return byte & 0x80; } // True when the most significant bit is set, i.e. for any non-ASCII byte (UTF-8 lead byte or continuation byte alike)
+
+ void yyinp() // Advance past the current code point: updates _currentChar, _currentCharUtf16 and _yychar
{
- _yychar = *++_currentChar;
+ ++_currentCharUtf16; // every consumed code point counts as at least one UTF-16 code unit
+
+ // Process multi-byte UTF-8 code point (non-ASCII)
+ if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) { // NOTE(review): a stray continuation byte (10xxxxxx) also takes this branch and is handled like a 2-byte lead byte
+ unsigned trailingBytesCurrentCodePoint = 1; // a multi-byte sequence has at least one trailing byte
+ for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1) // drop the top two bits of the lead byte; each further set top bit means one more trailing byte
+ ++trailingBytesCurrentCodePoint;
+ // Code points >= 0x00010000 are represented by two UTF16 code units
+ if (trailingBytesCurrentCodePoint >= 3) // three trailing bytes == 4-byte UTF-8 sequence
+ ++_currentCharUtf16;
+ _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1); // skip lead byte plus trailing bytes, then load the next byte; NOTE(review): no check against _lastChar is visible here - confirm truncated input cannot overrun
+
+ // Process single-byte UTF-8 code point (ASCII)
+ } else {
+ _yychar = *++_currentChar;
+ }
+
if (CPLUSPLUS_UNLIKELY(_yychar == '\n')) // the byte that just became current is a newline
pushLineStartOffset(); // declared in this class; its body is not shown here
}
- void pushLineStartOffset();
-
private:
struct Flags {
unsigned _scanCommentTokens: 1;
@@ -105,6 +123,10 @@ private:
const char *_lastChar;
const char *_tokenStart;
unsigned char _yychar;
+
+ unsigned _currentCharUtf16;
+ unsigned _tokenStartUtf16;
+
union {
unsigned char _state;
State s;
@@ -113,6 +135,7 @@ private:
unsigned _flags;
Flags f;
};
+
unsigned _currentLine;
LanguageFeatures _languageFeatures;
};