C++: Support for UTF-8 in the lexer

This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will come in handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
author: Nikolai Kosjar <nikolai.kosjar@digia.com> 2014-02-25 13:44:11 -0300
committer: Nikolai Kosjar <nikolai.kosjar@digia.com> 2014-05-23 14:23:15 +0200
commit: 70122b3061ee3fbb07442beb0158edf849ceb98e (patch)
tree: e8c272ec1df948acd27378a44764dd683ab5b426 /src/libs/3rdparty/cplusplus/Token.h
parent: 4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff)
1 files changed, 11 insertions, 9 deletions
diff --git a/src/libs/3rdparty/cplusplus/Token.h b/src/libs/3rdparty/cplusplus/Token.h
index 02d7f5ebe9..ec10483852 100644
--- a/src/libs/3rdparty/cplusplus/Token.h
+++ b/src/libs/3rdparty/cplusplus/Token.h
@@ -285,7 +285,7 @@ enum Kind {
 class CPLUSPLUS_EXPORT Token
 {
 public:
-    Token() : flags(0), byteOffset(0), ptr(0) {}
+    Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
 
     inline bool is(unsigned k) const    { return f.kind == k; }
     inline bool isNot(unsigned k) const { return f.kind != k; }
@@ -298,13 +298,14 @@ public:
     inline bool joined() const { return f.joined; }
     inline bool expanded() const { return f.expanded; }
     inline bool generated() const { return f.generated; }
-    inline unsigned bytes() const { return f.bytes; }
 
-    inline unsigned bytesBegin() const
-    { return byteOffset; }
+    inline unsigned bytes() const { return f.bytes; }
+    inline unsigned bytesBegin() const { return byteOffset; }
+    inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
 
-    inline unsigned bytesEnd() const
-    { return byteOffset + f.bytes; }
+    inline unsigned utf16chars() const { return f.utf16chars; }
+    inline unsigned utf16charsBegin() const { return utf16charOffset; }
+    inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
 
     inline bool isLiteral() const
     { return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
@@ -354,15 +355,17 @@ public:
         unsigned generated     : 1;
         // Unused...
         unsigned pad           : 3;
-        // The token length in bytes.
+        // The token length in bytes and UTF16 chars.
         unsigned bytes         : 16;
+        unsigned utf16chars    : 16;
     };
     union {
-        unsigned flags;
+        unsigned long flags;
         Flags f;
     };
 
     unsigned byteOffset;
+    unsigned utf16charOffset;
 
     union {
         void *ptr;
@@ -393,5 +396,4 @@ struct LanguageFeatures
 
 } // namespace CPlusPlus
 
-
 #endif // CPLUSPLUS_TOKEN_H
author	Nikolai Kosjar <nikolai.kosjar@digia.com>	2014-02-25 13:44:11 -0300
committer	Nikolai Kosjar <nikolai.kosjar@digia.com>	2014-05-23 14:23:15 +0200
commit	70122b3061ee3fbb07442beb0158edf849ceb98e (patch)
tree	e8c272ec1df948acd27378a44764dd683ab5b426 /src/libs/3rdparty/cplusplus/Token.h
parent	4fefb1ca2a5270752acf00d586393f472fb1b9a3 (diff)