From b555f45c10a050be9410f9dc6286a44753cc7d98 Mon Sep 17 00:00:00 2001
From: Lars Knoll <lars.knoll@qt.io>
Date: Tue, 6 Feb 2018 00:04:27 +0100
Subject: Implement parsing of ECMAScript 6 Unicode escape sequences

ECMAScript 6 added the \u{XXXX} syntax to encode arbitrary
Unicode code points. Support this properly in our lexer.

One known limitation is that code points outside the BMP are
not yet handled correctly.

Change-Id: Id46f9ec6fdbb264a5a919d84a16857afc9e8ca6e
Reviewed-by: Simon Hausmann <simon.hausmann@qt.io>
---
 src/qml/parser/qqmljslexer.cpp | 56 +++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 20 deletions(-)

(limited to 'src/qml/parser/qqmljslexer.cpp')

diff --git a/src/qml/parser/qqmljslexer.cpp b/src/qml/parser/qqmljslexer.cpp
index a3382fa92e..a8c9ead80b 100644
--- a/src/qml/parser/qqmljslexer.cpp
+++ b/src/qml/parser/qqmljslexer.cpp
@@ -315,36 +315,52 @@ int Lexer::lex()
     return _tokenKind;
 }
 
-bool Lexer::isUnicodeEscapeSequence(const QChar *chars)
-{
-    if (isHexDigit(chars[0]) && isHexDigit(chars[1]) && isHexDigit(chars[2]) && isHexDigit(chars[3]))
-        return true;
-
-    return false;
-}
-
 QChar Lexer::decodeUnicodeEscapeCharacter(bool *ok)
 {
-    if (_char == QLatin1Char('u') && isUnicodeEscapeSequence(&_codePtr[0])) {
-        scanChar(); // skip u
+    Q_ASSERT(_char == QLatin1Char('u'));
+    scanChar(); // skip u
+    if (_codePtr + 4 <= _endPtr && isHexDigit(_char)) {
+        ushort codePoint = 0;
+        for (int i = 0; i < 4; ++i) {
+            int digit = hexDigit(_char);
+            if (digit < 0)
+                goto error;
+            codePoint *= 16;
+            codePoint += digit;
+            scanChar();
+        }
 
-        const QChar c1 = _char;
-        scanChar();
+        *ok = true;
+        return QChar(codePoint);
+    } else if (_codePtr < _endPtr && _char == QLatin1Char('{')) {
+        scanChar(); // skip '{'
+        uint codePoint = 0;
+        if (!isHexDigit(_char))
+            // need at least one hex digit
+            goto error;
 
-        const QChar c2 = _char;
-        scanChar();
+        while (_codePtr <= _endPtr) {
+            int digit = hexDigit(_char);
+            if (digit < 0)
+                break;
+            codePoint *= 16;
+            codePoint += digit;
+            if (codePoint > 0x10ffff)
+                goto error;
+            scanChar();
+        }
 
-        const QChar c3 = _char;
-        scanChar();
+        if (_char != QLatin1Char('}'))
+            goto error;
 
-        const QChar c4 = _char;
-        scanChar();
+        scanChar(); // skip '}'
 
-        *ok = true;
 
-        return convertUnicode(c1, c2, c3, c4);
+        *ok = true;
+        return QChar(codePoint);
     }
 
+  error:
     _errorCode = IllegalUnicodeEscapeSequence;
     _errorMessage = QCoreApplication::translate("QQmlParser", "Illegal unicode escape sequence");
 
-- 
cgit v1.2.3