From 70122b3061ee3fbb07442beb0158edf849ceb98e Mon Sep 17 00:00:00 2001
From: Nikolai Kosjar <nikolai.kosjar@digia.com>
Date: Tue, 25 Feb 2014 13:44:11 -0300
Subject: C++: Support for UTF-8 in the lexer

This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.

API-wise, the following functions are added to Token. In follow-up
patches these will come in handy in combination with QStrings.
    utf16chars() - equivalent of bytes()
    utf16charsBegin() - equivalent of bytesBegin()
    utf16charsEnd() - equivalent of bytesEnd()

Next steps:
 * Adapt functions from TranslationUnit. They should work with utf16
   chars in order to calculate lines and columns correctly also for
   UTF-8 multi-byte code points.
 * Adapt the higher level clients:
    * Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
    * Cpp{Tools,Editor}: When dealing with identifiers on the
      QString/QTextDocument layer, code points
      represented by two QChars need to be respected, too.
 * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
   offsets usable in CppEditor/CppTools.

Addresses QTCREATORBUG-7356.

Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
---
 src/libs/3rdparty/cplusplus/Lexer.h | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'src/libs/3rdparty/cplusplus/Lexer.h')

diff --git a/src/libs/3rdparty/cplusplus/Lexer.h b/src/libs/3rdparty/cplusplus/Lexer.h
index 43a877e7a84..8d63d2ba1db 100644
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -62,6 +62,7 @@ public:
     void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
 
 private:
+    void pushLineStartOffset();
     void scan_helper(Token *tok);
     void setSource(const char *firstChar, const char *lastChar);
     static int classify(const char *string, int length, LanguageFeatures features);
@@ -77,15 +78,32 @@ private:
     void scanBackslash(Kind type);
     void scanCppComment(Kind type);
 
-    inline void yyinp()
+    static bool isByteOfMultiByteCodePoint(unsigned char byte)
+    { return byte & 0x80; } // Check if most significant bit is set
+
+    void yyinp()
     {
-        _yychar = *++_currentChar;
+        ++_currentCharUtf16;
+
+        // Process multi-byte UTF-8 code point (non-latin1)
+        if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
+            unsigned trailingBytesCurrentCodePoint = 1;
+            for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
+                ++trailingBytesCurrentCodePoint;
+            // Code points >= 0x00010000 are represented by two UTF16 code units
+            if (trailingBytesCurrentCodePoint >= 3)
+                ++_currentCharUtf16;
+            _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
+
+        // Process single-byte UTF-8 code point (latin1)
+        } else {
+            _yychar = *++_currentChar;
+        }
+
         if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
             pushLineStartOffset();
     }
 
-    void pushLineStartOffset();
-
 private:
     struct Flags {
         unsigned _scanCommentTokens: 1;
@@ -105,6 +123,10 @@ private:
     const char *_lastChar;
     const char *_tokenStart;
     unsigned char _yychar;
+
+    unsigned _currentCharUtf16;
+    unsigned _tokenStartUtf16;
+
     union {
         unsigned char _state;
         State s;
@@ -113,6 +135,7 @@ private:
         unsigned _flags;
         Flags f;
     };
+
     unsigned _currentLine;
     LanguageFeatures _languageFeatures;
 };
-- 
cgit v1.2.3