aboutsummaryrefslogtreecommitdiffstats
path: root/src/3rdparty/masm/yarr/YarrParser.h
diff options
context:
space:
mode:
authorLaszlo Agocs <laszlo.agocs@qt.io>2019-03-21 20:42:38 +0100
committerLaszlo Agocs <laszlo.agocs@qt.io>2019-03-21 20:42:47 +0100
commit6767114285db9d0e16dc278d08f231e8561546b4 (patch)
tree0945902a2242fd7ec0a1f7fd3e6acbb769e723bd /src/3rdparty/masm/yarr/YarrParser.h
parentee076afedccbe1d37306a7972051f84eb036d655 (diff)
parentc32b109e9dea44c6775c2dbf8f164870c1dc8971 (diff)
Merge remote-tracking branch 'origin/dev' into wip/scenegraphng
Diffstat (limited to 'src/3rdparty/masm/yarr/YarrParser.h')
-rw-r--r--src/3rdparty/masm/yarr/YarrParser.h555
1 files changed, 434 insertions, 121 deletions
diff --git a/src/3rdparty/masm/yarr/YarrParser.h b/src/3rdparty/masm/yarr/YarrParser.h
index 8c5d71b5fe..f1ffc92189 100644
--- a/src/3rdparty/masm/yarr/YarrParser.h
+++ b/src/3rdparty/masm/yarr/YarrParser.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -23,46 +23,25 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef YarrParser_h
-#define YarrParser_h
+#pragma once
#include "Yarr.h"
+#include "YarrPattern.h"
+#include "YarrUnicodeProperties.h"
#include <wtf/ASCIICType.h>
+#include <wtf/HashSet.h>
+#include <wtf/Optional.h>
+#include <wtf/text/StringBuilder.h>
#include <wtf/text/WTFString.h>
-#include <wtf/unicode/Unicode.h>
namespace JSC { namespace Yarr {
-#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
-
-enum BuiltInCharacterClassID {
- DigitClassID,
- SpaceClassID,
- WordClassID,
- NewlineClassID,
-};
-
// The Parser class should not be used directly - only via the Yarr::parse() method.
template<class Delegate, typename CharType>
class Parser {
private:
template<class FriendDelegate>
- friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
-
- enum ErrorCode {
- NoError,
- PatternTooLarge,
- QuantifierOutOfOrder,
- QuantifierWithoutAtom,
- QuantifierTooLarge,
- MissingParentheses,
- ParenthesesUnmatched,
- ParenthesesTypeInvalid,
- CharacterClassUnmatched,
- CharacterClassOutOfOrder,
- EscapeUnterminated,
- NumberOfErrorCodes
- };
+ friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);
/*
* CharacterClassParserDelegate:
@@ -77,7 +56,7 @@ private:
public:
CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
: m_delegate(delegate)
- , m_err(err)
+ , m_errorCode(err)
, m_state(Empty)
, m_character(0)
{
@@ -102,7 +81,7 @@ private:
* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
* is different to /[a\-z]/).
*/
- void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+ void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
{
switch (m_state) {
case AfterCharacterClass:
@@ -119,6 +98,7 @@ private:
return;
}
// Otherwise just fall through - cached character so treat this as Empty.
+ FALLTHROUGH;
case Empty:
m_character = ch;
@@ -136,7 +116,7 @@ private:
case CachedCharacterHyphen:
if (ch < m_character) {
- m_err = CharacterClassOutOfOrder;
+ m_errorCode = ErrorCode::CharacterClassOutOfOrder;
return;
}
m_delegate.atomCharacterClassRange(m_character, ch);
@@ -168,7 +148,7 @@ private:
case CachedCharacter:
// Flush the currently cached character, then fall through.
m_delegate.atomCharacterClassAtom(m_character);
-
+ FALLTHROUGH;
case Empty:
case AfterCharacterClass:
m_state = AfterCharacterClass;
@@ -186,7 +166,7 @@ private:
case CachedCharacterHyphen:
m_delegate.atomCharacterClassAtom(m_character);
m_delegate.atomCharacterClassAtom('-');
- // fall through
+ FALLTHROUGH;
case AfterCharacterClassHyphen:
m_delegate.atomCharacterClassBuiltIn(classID, invert);
m_state = Empty;
@@ -214,10 +194,13 @@ private:
// invoked with inCharacterClass set.
NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
+ NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
+ bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); return false; }
+ NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
private:
Delegate& m_delegate;
- ErrorCode& m_err;
+ ErrorCode& m_errorCode;
enum CharacterClassConstructionState {
Empty,
CachedCharacter,
@@ -225,20 +208,31 @@ private:
AfterCharacterClass,
AfterCharacterClassHyphen,
} m_state;
- UChar m_character;
+ UChar32 m_character;
};
- Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
+ Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
: m_delegate(delegate)
, m_backReferenceLimit(backReferenceLimit)
- , m_err(NoError)
- , m_data(pattern.getCharacters<CharType>())
+ , m_data(pattern.characters<CharType>())
, m_size(pattern.length())
- , m_index(0)
- , m_parenthesesNestingDepth(0)
+ , m_isUnicode(isUnicode)
{
}
+ // The handling of IdentityEscapes is different depending on the unicode flag.
+ // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
+ // For non-unicode patterns, most any character can be escaped.
+ bool isIdentityEscapeAnError(int ch)
+ {
+ if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) {
+ m_errorCode = ErrorCode::InvalidIdentityEscape;
+ return true;
+ }
+
+ return false;
+ }
+
/*
* parseEscape():
*
@@ -262,12 +256,12 @@ private:
template<bool inCharacterClass, class EscapeDelegate>
bool parseEscape(EscapeDelegate& delegate)
{
- ASSERT(!m_err);
+ ASSERT(!hasError(m_errorCode));
ASSERT(peek() == '\\');
consume();
if (atEndOfPattern()) {
- m_err = EscapeUnterminated;
+ m_errorCode = ErrorCode::EscapeUnterminated;
return false;
}
@@ -275,18 +269,24 @@ private:
// Assertions
case 'b':
consume();
- if (inCharacterClass)
+ if (inCharacterClass) {
+ if (isIdentityEscapeAnError('b'))
+ break;
+
delegate.atomPatternCharacter('\b');
- else {
+ } else {
delegate.assertionWordBoundary(false);
return false;
}
break;
case 'B':
consume();
- if (inCharacterClass)
+ if (inCharacterClass) {
+ if (isIdentityEscapeAnError('B'))
+ break;
+
delegate.atomPatternCharacter('B');
- else {
+ } else {
delegate.assertionWordBoundary(true);
return false;
}
@@ -295,27 +295,27 @@ private:
// CharacterClassEscape
case 'd':
consume();
- delegate.atomBuiltInCharacterClass(DigitClassID, false);
+ delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
break;
case 's':
consume();
- delegate.atomBuiltInCharacterClass(SpaceClassID, false);
+ delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
break;
case 'w':
consume();
- delegate.atomBuiltInCharacterClass(WordClassID, false);
+ delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
break;
case 'D':
consume();
- delegate.atomBuiltInCharacterClass(DigitClassID, true);
+ delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
break;
case 'S':
consume();
- delegate.atomBuiltInCharacterClass(SpaceClassID, true);
+ delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
break;
case 'W':
consume();
- delegate.atomBuiltInCharacterClass(WordClassID, true);
+ delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
break;
// DecimalEscape
@@ -340,15 +340,21 @@ private:
}
restoreState(state);
+
+ if (m_isUnicode) {
+ m_errorCode = ErrorCode::InvalidBackreference;
+ return false;
+ }
}
-
- // Not a backreference, and not octal.
+
+ // Not a backreference, and not octal. Just a number.
if (peek() >= '8') {
- delegate.atomPatternCharacter('\\');
+ delegate.atomPatternCharacter(consume());
break;
}
// Fall-through to handle this as an octal escape.
+ FALLTHROUGH;
}
// Octal escape
@@ -400,32 +406,168 @@ private:
case 'x': {
consume();
int x = tryConsumeHex(2);
- if (x == -1)
+ if (x == -1) {
+ if (isIdentityEscapeAnError('x'))
+ break;
+
delegate.atomPatternCharacter('x');
- else
+ } else
delegate.atomPatternCharacter(x);
break;
}
+ // Named backreference
+ case 'k': {
+ consume();
+ ParseState state = saveState();
+ if (!atEndOfPattern() && !inCharacterClass) {
+ if (consume() == '<') {
+ auto groupName = tryConsumeGroupName();
+ if (groupName) {
+ if (m_captureGroupNames.contains(groupName.value())) {
+ delegate.atomNamedBackReference(groupName.value());
+ break;
+ }
+
+ if (delegate.isValidNamedForwardReference(groupName.value())) {
+ delegate.atomNamedForwardReference(groupName.value());
+ break;
+ }
+ }
+ if (m_isUnicode) {
+ m_errorCode = ErrorCode::InvalidBackreference;
+ break;
+ }
+ }
+ }
+ restoreState(state);
+ delegate.atomPatternCharacter('k');
+ break;
+ }
+
+ // Unicode property escapes
+ case 'p':
+ case 'P': {
+ int escapeChar = consume();
+
+ if (!m_isUnicode) {
+ if (isIdentityEscapeAnError(escapeChar))
+ break;
+ delegate.atomPatternCharacter(escapeChar);
+ break;
+ }
+
+ if (!atEndOfPattern() && peek() == '{') {
+ consume();
+ auto optClassID = tryConsumeUnicodePropertyExpression();
+ if (!optClassID) {
+ // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
+ break;
+ }
+ delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P');
+ } else
+ m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
+ break;
+ }
+
// UnicodeEscape
case 'u': {
consume();
+ if (atEndOfPattern()) {
+ if (isIdentityEscapeAnError('u'))
+ break;
+
+ delegate.atomPatternCharacter('u');
+ break;
+ }
+
+ if (m_isUnicode && peek() == '{') {
+ consume();
+ UChar32 codePoint = 0;
+ do {
+ if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
+ m_errorCode = ErrorCode::InvalidUnicodeEscape;
+ break;
+ }
+
+ codePoint = (codePoint << 4) | toASCIIHexValue(consume());
+
+ if (codePoint > UCHAR_MAX_VALUE)
+ m_errorCode = ErrorCode::InvalidUnicodeEscape;
+ } while (!atEndOfPattern() && peek() != '}');
+ if (!atEndOfPattern() && peek() == '}')
+ consume();
+ else if (!hasError(m_errorCode))
+ m_errorCode = ErrorCode::InvalidUnicodeEscape;
+ if (hasError(m_errorCode))
+ return false;
+
+ delegate.atomPatternCharacter(codePoint);
+ break;
+ }
int u = tryConsumeHex(4);
- if (u == -1)
+ if (u == -1) {
+ if (isIdentityEscapeAnError('u'))
+ break;
+
delegate.atomPatternCharacter('u');
- else
+ } else {
+ // If we have the first of a surrogate pair, look for the second.
+ if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
+ ParseState state = saveState();
+ consume();
+
+ if (tryConsume('u')) {
+ int surrogate2 = tryConsumeHex(4);
+ if (U16_IS_TRAIL(surrogate2)) {
+ u = U16_GET_SUPPLEMENTARY(u, surrogate2);
+ delegate.atomPatternCharacter(u);
+ break;
+ }
+ }
+
+ restoreState(state);
+ }
delegate.atomPatternCharacter(u);
+ }
break;
}
// IdentityEscape
default:
+ int ch = peek();
+
+ if (ch == '-' && m_isUnicode && inCharacterClass) {
+ // \- is allowed for ClassEscape with unicode flag.
+ delegate.atomPatternCharacter(consume());
+ break;
+ }
+
+ if (isIdentityEscapeAnError(ch))
+ break;
+
delegate.atomPatternCharacter(consume());
}
return true;
}
+ UChar32 consumePossibleSurrogatePair()
+ {
+ UChar32 ch = consume();
+ if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) {
+ ParseState state = saveState();
+
+ UChar32 surrogate2 = consume();
+ if (U16_IS_TRAIL(surrogate2))
+ ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
+ else
+ restoreState(state);
+ }
+
+ return ch;
+ }
+
/*
* parseAtomEscape(), parseCharacterClassEscape():
*
@@ -449,11 +591,11 @@ private:
*/
void parseCharacterClass()
{
- ASSERT(!m_err);
+ ASSERT(!hasError(m_errorCode));
ASSERT(peek() == '[');
consume();
- CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
+ CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);
characterClassConstructor.begin(tryConsume('^'));
@@ -469,14 +611,14 @@ private:
break;
default:
- characterClassConstructor.atomPatternCharacter(consume(), true);
+ characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
}
- if (m_err)
+ if (hasError(m_errorCode))
return;
}
- m_err = CharacterClassUnmatched;
+ m_errorCode = ErrorCode::CharacterClassUnmatched;
}
/*
@@ -486,13 +628,13 @@ private:
*/
void parseParenthesesBegin()
{
- ASSERT(!m_err);
+ ASSERT(!hasError(m_errorCode));
ASSERT(peek() == '(');
consume();
if (tryConsume('?')) {
if (atEndOfPattern()) {
- m_err = ParenthesesTypeInvalid;
+ m_errorCode = ErrorCode::ParenthesesTypeInvalid;
return;
}
@@ -508,9 +650,23 @@ private:
case '!':
m_delegate.atomParentheticalAssertionBegin(true);
break;
-
+
+ case '<': {
+ auto groupName = tryConsumeGroupName();
+ if (groupName) {
+ auto setAddResult = m_captureGroupNames.add(groupName.value());
+ if (setAddResult.isNewEntry)
+ m_delegate.atomParenthesesSubpatternBegin(true, groupName);
+ else
+ m_errorCode = ErrorCode::DuplicateGroupName;
+ } else
+ m_errorCode = ErrorCode::InvalidGroupName;
+
+ break;
+ }
+
default:
- m_err = ParenthesesTypeInvalid;
+ m_errorCode = ErrorCode::ParenthesesTypeInvalid;
}
} else
m_delegate.atomParenthesesSubpatternBegin();
@@ -525,14 +681,14 @@ private:
*/
void parseParenthesesEnd()
{
- ASSERT(!m_err);
+ ASSERT(!hasError(m_errorCode));
ASSERT(peek() == ')');
consume();
if (m_parenthesesNestingDepth > 0)
m_delegate.atomParenthesesEnd();
else
- m_err = ParenthesesUnmatched;
+ m_errorCode = ErrorCode::ParenthesesUnmatched;
--m_parenthesesNestingDepth;
}
@@ -544,18 +700,18 @@ private:
*/
void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
{
- ASSERT(!m_err);
+ ASSERT(!hasError(m_errorCode));
ASSERT(min <= max);
if (min == UINT_MAX) {
- m_err = QuantifierTooLarge;
+ m_errorCode = ErrorCode::QuantifierTooLarge;
return;
}
if (lastTokenWasAnAtom)
m_delegate.quantifyAtom(min, max, !tryConsume('?'));
else
- m_err = QuantifierWithoutAtom;
+ m_errorCode = ErrorCode::QuantifierWithoutAtom;
}
/*
@@ -603,7 +759,7 @@ private:
case '.':
consume();
- m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
+ m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
lastTokenWasAnAtom = true;
break;
@@ -649,58 +805,44 @@ private:
if (min <= max)
parseQuantifier(lastTokenWasAnAtom, min, max);
else
- m_err = QuantifierOutOfOrder;
+ m_errorCode = ErrorCode::QuantifierOutOfOrder;
lastTokenWasAnAtom = false;
break;
}
}
restoreState(state);
- } // if we did not find a complete quantifer, fall through to the default case.
+ }
+ // if we did not find a complete quantifer, fall through to the default case.
+ FALLTHROUGH;
default:
- m_delegate.atomPatternCharacter(consume());
+ m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
lastTokenWasAnAtom = true;
}
- if (m_err)
+ if (hasError(m_errorCode))
return;
}
if (m_parenthesesNestingDepth > 0)
- m_err = MissingParentheses;
+ m_errorCode = ErrorCode::MissingParentheses;
}
/*
* parse():
*
- * This method calls parseTokens() to parse over the input and converts any
- * error code to a const char* for a result.
+ * This method calls parseTokens() to parse over the input and returns error code for a result.
*/
- const char* parse()
+ ErrorCode parse()
{
if (m_size > MAX_PATTERN_SIZE)
- m_err = PatternTooLarge;
+ m_errorCode = ErrorCode::PatternTooLarge;
else
parseTokens();
- ASSERT(atEndOfPattern() || m_err);
-
- // The order of this array must match the ErrorCode enum.
- static const char* errorMessages[NumberOfErrorCodes] = {
- 0, // NoError
- REGEXP_ERROR_PREFIX "regular expression too large",
- REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
- REGEXP_ERROR_PREFIX "nothing to repeat",
- REGEXP_ERROR_PREFIX "number too large in {} quantifier",
- REGEXP_ERROR_PREFIX "missing )",
- REGEXP_ERROR_PREFIX "unmatched parentheses",
- REGEXP_ERROR_PREFIX "unrecognized character after (?",
- REGEXP_ERROR_PREFIX "missing terminating ] for character class",
- REGEXP_ERROR_PREFIX "range out of order in character class",
- REGEXP_ERROR_PREFIX "\\ at end of pattern"
- };
-
- return errorMessages[m_err];
+ ASSERT(atEndOfPattern() || hasError(m_errorCode));
+
+ return m_errorCode;
}
// Misc helper functions:
@@ -723,6 +865,12 @@ private:
return m_index == m_size;
}
+ unsigned patternRemaining()
+ {
+ ASSERT(m_index <= m_size);
+ return m_size - m_index;
+ }
+
int peek()
{
ASSERT(m_index < m_size);
@@ -740,6 +888,87 @@ private:
return peek() - '0';
}
+ int tryConsumeUnicodeEscape()
+ {
+ if (!tryConsume('u'))
+ return -1;
+
+ if (m_isUnicode && tryConsume('{')) {
+ int codePoint = 0;
+ do {
+ if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
+ m_errorCode = ErrorCode::InvalidUnicodeEscape;
+ return -1;
+ }
+
+ codePoint = (codePoint << 4) | toASCIIHexValue(consume());
+
+ if (codePoint > UCHAR_MAX_VALUE) {
+ m_errorCode = ErrorCode::InvalidUnicodeEscape;
+ return -1;
+ }
+ } while (!atEndOfPattern() && peek() != '}');
+ if (!atEndOfPattern() && peek() == '}')
+ consume();
+ else if (!hasError(m_errorCode))
+ m_errorCode = ErrorCode::InvalidUnicodeEscape;
+ if (hasError(m_errorCode))
+ return -1;
+
+ return codePoint;
+ }
+
+ int u = tryConsumeHex(4);
+ if (u == -1)
+ return -1;
+
+ // If we have the first of a surrogate pair, look for the second.
+ if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
+ ParseState state = saveState();
+ consume();
+
+ if (tryConsume('u')) {
+ int surrogate2 = tryConsumeHex(4);
+ if (U16_IS_TRAIL(surrogate2)) {
+ u = U16_GET_SUPPLEMENTARY(u, surrogate2);
+ return u;
+ }
+ }
+
+ restoreState(state);
+ }
+
+ return u;
+ }
+
+ int tryConsumeIdentifierCharacter()
+ {
+ int ch = peek();
+
+ if (ch == '\\') {
+ consume();
+ ch = tryConsumeUnicodeEscape();
+ } else
+ consume();
+
+ return ch;
+ }
+
+ bool isIdentifierStart(int ch)
+ {
+ return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK);
+ }
+
+ bool isIdentifierPart(int ch)
+ {
+ return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D;
+ }
+
+ bool isUnicodePropertyValueExpressionChar(int ch)
+ {
+ return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '=';
+ }
+
int consume()
{
ASSERT(m_index < m_size);
@@ -754,13 +983,10 @@ private:
unsigned consumeNumber()
{
- unsigned n = consumeDigit();
- // check for overflow.
- for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
- n = newValue;
- consume();
- }
- return n;
+ Checked<unsigned, RecordOverflow> n = consumeDigit();
+ while (peekIsDigit())
+ n = n * 10 + consumeDigit();
+ return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet();
}
unsigned consumeOctal()
@@ -796,13 +1022,99 @@ private:
return n;
}
+ std::optional<String> tryConsumeGroupName()
+ {
+ if (atEndOfPattern())
+ return std::nullopt;
+
+ ParseState state = saveState();
+
+ int ch = tryConsumeIdentifierCharacter();
+
+ if (isIdentifierStart(ch)) {
+ StringBuilder identifierBuilder;
+ identifierBuilder.append(ch);
+
+ while (!atEndOfPattern()) {
+ ch = tryConsumeIdentifierCharacter();
+ if (ch == '>')
+ return std::optional<String>(identifierBuilder.toString());
+
+ if (!isIdentifierPart(ch))
+ break;
+
+ identifierBuilder.append(ch);
+ }
+ }
+
+ restoreState(state);
+
+ return std::nullopt;
+ }
+
+ std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
+ {
+ if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) {
+ m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
+ return std::nullopt;
+ }
+
+ StringBuilder expressionBuilder;
+ String unicodePropertyName;
+ bool foundEquals = false;
+ unsigned errors = 0;
+
+ expressionBuilder.append(consume());
+
+ while (!atEndOfPattern()) {
+ int ch = peek();
+ if (ch == '}') {
+ consume();
+ if (errors) {
+ m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
+ return std::nullopt;
+ }
+
+ if (foundEquals) {
+ auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
+ if (!result)
+ m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
+ return result;
+ }
+
+ auto result = unicodeMatchProperty(expressionBuilder.toString());
+ if (!result)
+ m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
+ return result;
+ }
+
+ consume();
+ if (ch == '=') {
+ if (!foundEquals) {
+ foundEquals = true;
+ unicodePropertyName = expressionBuilder.toString();
+ expressionBuilder.clear();
+ } else
+ errors++;
+ } else if (!isUnicodePropertyValueExpressionChar(ch))
+ errors++;
+ else
+ expressionBuilder.append(ch);
+ }
+
+ m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
+ return std::nullopt;
+ }
+
Delegate& m_delegate;
unsigned m_backReferenceLimit;
- ErrorCode m_err;
+ ErrorCode m_errorCode { ErrorCode::NoError };
const CharType* m_data;
unsigned m_size;
- unsigned m_index;
- unsigned m_parenthesesNestingDepth;
+ unsigned m_index { 0 };
+ bool m_isUnicode;
+ unsigned m_parenthesesNestingDepth { 0 };
+ HashSet<String> m_captureGroupNames;
// Derived by empirical testing of compile time in PCRE and WREC.
static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
@@ -822,17 +1134,20 @@ private:
* void assertionEOL();
* void assertionWordBoundary(bool invert);
*
- * void atomPatternCharacter(UChar ch);
+ * void atomPatternCharacter(UChar32 ch);
* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
* void atomCharacterClassBegin(bool invert)
- * void atomCharacterClassAtom(UChar ch)
- * void atomCharacterClassRange(UChar begin, UChar end)
+ * void atomCharacterClassAtom(UChar32 ch)
+ * void atomCharacterClassRange(UChar32 begin, UChar32 end)
* void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
* void atomCharacterClassEnd()
- * void atomParenthesesSubpatternBegin(bool capture = true);
+ * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName);
* void atomParentheticalAssertionBegin(bool invert = false);
* void atomParenthesesEnd();
* void atomBackReference(unsigned subpatternId);
+ * void atomNamedBackReference(const String& subpatternName);
+ * bool isValidNamedForwardReference(const String& subpatternName);
+ * void atomNamedForwardReference(const String& subpatternName);
*
* void quantifyAtom(unsigned min, unsigned max, bool greedy);
*
@@ -868,13 +1183,11 @@ private:
*/
template<class Delegate>
-const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
+ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
{
if (pattern.is8Bit())
- return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
- return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
+ return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
+ return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
}
} } // namespace JSC::Yarr
-
-#endif // YarrParser_h