diff options
author | Lars Knoll <lars.knoll@qt.io> | 2018-02-08 13:42:50 +0100 |
---|---|---|
committer | Lars Knoll <lars.knoll@qt.io> | 2018-02-09 07:55:13 +0000 |
commit | 0ba8764849eb44e62dd8ddc22d1335174ac5beb4 (patch) | |
tree | 57461dab5c93569a2289726d036943a41c0a73a5 /src/qml/parser/qqmljslexer.cpp | |
parent | 6059f6f2a485ca3dddfedc55536e4f62f349a990 (diff) |
Properly parse identifiers containing chars outside the BMP
If the identifier contains characters outside the Unicode
BMP, we need to correctly decode them from surrogate pairs
or Unicode escape sequences before testing whether they are
valid characters for an identifier.
Change-Id: I5d6ceaf27a353d3a708da86f08cfb9796eb8b1d3
Reviewed-by: Simon Hausmann <simon.hausmann@qt.io>
Diffstat (limited to 'src/qml/parser/qqmljslexer.cpp')
-rw-r--r-- | src/qml/parser/qqmljslexer.cpp | 117 |
1 files changed, 76 insertions, 41 deletions
diff --git a/src/qml/parser/qqmljslexer.cpp b/src/qml/parser/qqmljslexer.cpp index a8c9ead80b..6a3bd6d887 100644 --- a/src/qml/parser/qqmljslexer.cpp +++ b/src/qml/parser/qqmljslexer.cpp @@ -315,12 +315,12 @@ int Lexer::lex() return _tokenKind; } -QChar Lexer::decodeUnicodeEscapeCharacter(bool *ok) +uint Lexer::decodeUnicodeEscapeCharacter(bool *ok) { Q_ASSERT(_char == QLatin1Char('u')); scanChar(); // skip u if (_codePtr + 4 <= _endPtr && isHexDigit(_char)) { - ushort codePoint = 0; + uint codePoint = 0; for (int i = 0; i < 4; ++i) { int digit = hexDigit(_char); if (digit < 0) @@ -331,7 +331,7 @@ QChar Lexer::decodeUnicodeEscapeCharacter(bool *ok) } *ok = true; - return QChar(codePoint); + return codePoint; } else if (_codePtr < _endPtr && _char == QLatin1Char('{')) { scanChar(); // skip '{' uint codePoint = 0; @@ -357,7 +357,7 @@ QChar Lexer::decodeUnicodeEscapeCharacter(bool *ok) *ok = true; - return QChar(codePoint); + return codePoint; } error: @@ -365,7 +365,7 @@ QChar Lexer::decodeUnicodeEscapeCharacter(bool *ok) _errorMessage = QCoreApplication::translate("QQmlParser", "Illegal unicode escape sequence"); *ok = false; - return QChar(); + return 0; } QChar Lexer::decodeHexEscapeCharacter(bool *ok) @@ -389,15 +389,15 @@ QChar Lexer::decodeHexEscapeCharacter(bool *ok) return QChar(); } -static inline bool isIdentifierStart(QChar ch) +static inline bool isIdentifierStart(uint ch) { // fast path for ascii - if ((ch.unicode() >= 'a' && ch.unicode() <= 'z') || - (ch.unicode() >= 'A' && ch.unicode() <= 'Z') || + if ((ch >= 'a' && ch <= 'z') || + (ch >= 'A' && ch <= 'Z') || ch == '$' || ch == '_') return true; - switch (ch.category()) { + switch (QChar::category(ch)) { case QChar::Number_Letter: case QChar::Letter_Uppercase: case QChar::Letter_Lowercase: @@ -411,17 +411,17 @@ static inline bool isIdentifierStart(QChar ch) return false; } -static bool isIdentifierPart(QChar ch) +static bool isIdentifierPart(uint ch) { // fast path for ascii - if ((ch.unicode() >= 'a' 
&& ch.unicode() <= 'z') || - (ch.unicode() >= 'A' && ch.unicode() <= 'Z') || - (ch.unicode() >= '0' && ch.unicode() <= '9') || + if ((ch >= 'a' && ch <= 'z') || + (ch >= 'A' && ch <= 'Z') || + (ch >= '0' && ch <= '9') || ch == '$' || ch == '_' || - ch.unicode() == 0x200c /* ZWNJ */ || ch.unicode() == 0x200d /* ZWJ */) + ch == 0x200c /* ZWNJ */ || ch == 0x200d /* ZWJ */) return true; - switch (ch.category()) { + switch (QChar::category(ch)) { case QChar::Mark_NonSpacing: case QChar::Mark_SpacingCombining: @@ -731,9 +731,16 @@ again: // unicode escape sequence case 'u': { bool ok = false; - u = decodeUnicodeEscapeCharacter(&ok); + uint codePoint = decodeUnicodeEscapeCharacter(&ok); if (!ok) return T_ERROR; + if (QChar::requiresSurrogates(codePoint)) { + // need to use a surrogate pair + _tokenText += QChar(QChar::highSurrogate(codePoint)); + u = QChar::lowSurrogate(codePoint); + } else { + u = codePoint; + } } break; // hex escape sequence @@ -815,9 +822,12 @@ again: return scanNumber(ch); default: { - QChar c = ch; + uint c = ch.unicode(); bool identifierWithEscapeChars = false; - if (c == QLatin1Char('\\') && _char == QLatin1Char('u')) { + if (QChar::isHighSurrogate(c) && QChar::isLowSurrogate(_char.unicode())) { + c = QChar::surrogateToUcs4(ushort(c), _char.unicode()); + scanChar(); + } else if (c == '\\' && _char == QLatin1Char('u')) { identifierWithEscapeChars = true; bool ok = false; c = decodeUnicodeEscapeCharacter(&ok); @@ -827,13 +837,21 @@ again: if (isIdentifierStart(c)) { if (identifierWithEscapeChars) { _tokenText.resize(0); - _tokenText += c; + if (QChar::requiresSurrogates(c)) { + _tokenText += QChar(QChar::highSurrogate(c)); + _tokenText += QChar(QChar::lowSurrogate(c)); + } else { + _tokenText += QChar(c); + } _validTokenText = true; } - while (true) { - c = _char; - if (_char == QLatin1Char('\\') && _codePtr[0] == QLatin1Char('u')) { - if (! 
identifierWithEscapeChars) { + while (_codePtr <= _endPtr) { + c = _char.unicode(); + if (QChar::isHighSurrogate(c) && QChar::isLowSurrogate(_codePtr->unicode())) { + scanChar(); + c = QChar::surrogateToUcs4(ushort(c), _char.unicode()); + } else if (_char == QLatin1Char('\\') && _codePtr[0] == QLatin1Char('u')) { + if (!identifierWithEscapeChars) { identifierWithEscapeChars = true; _tokenText.resize(0); _tokenText.insert(0, _tokenStartPtr, _codePtr - _tokenStartPtr - 1); @@ -845,33 +863,50 @@ again: c = decodeUnicodeEscapeCharacter(&ok); if (!ok) return T_ERROR; - if (isIdentifierPart(c)) - _tokenText += c; - continue; - } else if (isIdentifierPart(c)) { - if (identifierWithEscapeChars) - _tokenText += c; - scanChar(); + if (!isIdentifierPart(c)) + break; + + if (identifierWithEscapeChars) { + if (QChar::requiresSurrogates(c)) { + _tokenText += QChar(QChar::highSurrogate(c)); + _tokenText += QChar(QChar::lowSurrogate(c)); + } else { + _tokenText += QChar(c); + } + } continue; } - _tokenLength = _codePtr - _tokenStartPtr - 1; + if (!isIdentifierPart(c)) + break; - int kind = T_IDENTIFIER; + if (identifierWithEscapeChars) { + if (QChar::requiresSurrogates(c)) { + _tokenText += QChar(QChar::highSurrogate(c)); + _tokenText += QChar(QChar::lowSurrogate(c)); + } else { + _tokenText += QChar(c); + } + } + scanChar(); + } - if (! 
identifierWithEscapeChars) - kind = classify(_tokenStartPtr, _tokenLength, _qmlMode); + _tokenLength = _codePtr - _tokenStartPtr - 1; - if (_engine) { - if (kind == T_IDENTIFIER && identifierWithEscapeChars) - _tokenSpell = _engine->newStringRef(_tokenText); - else - _tokenSpell = _engine->midRef(_tokenStartPtr - _code.unicode(), _tokenLength); - } + int kind = T_IDENTIFIER; + + if (!identifierWithEscapeChars) + kind = classify(_tokenStartPtr, _tokenLength, _qmlMode); - return kind; + if (_engine) { + if (kind == T_IDENTIFIER && identifierWithEscapeChars) + _tokenSpell = _engine->newStringRef(_tokenText); + else + _tokenSpell = _engine->midRef(_tokenStartPtr - _code.unicode(), _tokenLength); } + + return kind; } } |