diff options
author | Laszlo Agocs <laszlo.agocs@qt.io> | 2019-03-21 20:42:38 +0100 |
---|---|---|
committer | Laszlo Agocs <laszlo.agocs@qt.io> | 2019-03-21 20:42:47 +0100 |
commit | 6767114285db9d0e16dc278d08f231e8561546b4 (patch) | |
tree | 0945902a2242fd7ec0a1f7fd3e6acbb769e723bd /src/3rdparty/masm/yarr/YarrParser.h | |
parent | ee076afedccbe1d37306a7972051f84eb036d655 (diff) | |
parent | c32b109e9dea44c6775c2dbf8f164870c1dc8971 (diff) |
Merge remote-tracking branch 'origin/dev' into wip/scenegraphng
Change-Id: Ib5662d80d5b2f58cf4634c54c054545ba9df807e
Diffstat (limited to 'src/3rdparty/masm/yarr/YarrParser.h')
-rw-r--r-- | src/3rdparty/masm/yarr/YarrParser.h | 555 |
1 files changed, 434 insertions, 121 deletions
diff --git a/src/3rdparty/masm/yarr/YarrParser.h b/src/3rdparty/masm/yarr/YarrParser.h index 8c5d71b5fe..f1ffc92189 100644 --- a/src/3rdparty/masm/yarr/YarrParser.h +++ b/src/3rdparty/masm/yarr/YarrParser.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,46 +23,25 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrParser_h -#define YarrParser_h +#pragma once #include "Yarr.h" +#include "YarrPattern.h" +#include "YarrUnicodeProperties.h" #include <wtf/ASCIICType.h> +#include <wtf/HashSet.h> +#include <wtf/Optional.h> +#include <wtf/text/StringBuilder.h> #include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { -#define REGEXP_ERROR_PREFIX "Invalid regular expression: " - -enum BuiltInCharacterClassID { - DigitClassID, - SpaceClassID, - WordClassID, - NewlineClassID, -}; - // The Parser class should not be used directly - only via the Yarr::parse() method. template<class Delegate, typename CharType> class Parser { private: template<class FriendDelegate> - friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - QuantifierTooLarge, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; + friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); /* * CharacterClassParserDelegate: @@ -77,7 +56,7 @@ private: public: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) - , m_err(err) + , m_errorCode(err) , m_state(Empty) , m_character(0) { @@ -102,7 +81,7 @@ private: * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ * is different to /[a\-z]/). */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) { switch (m_state) { case AfterCharacterClass: @@ -119,6 +98,7 @@ private: return; } // Otherwise just fall through - cached character so treat this as Empty. + FALLTHROUGH; case Empty: m_character = ch; @@ -136,7 +116,7 @@ private: case CachedCharacterHyphen: if (ch < m_character) { - m_err = CharacterClassOutOfOrder; + m_errorCode = ErrorCode::CharacterClassOutOfOrder; return; } m_delegate.atomCharacterClassRange(m_character, ch); @@ -168,7 +148,7 @@ private: case CachedCharacter: // Flush the currently cached character, then fall through. m_delegate.atomCharacterClassAtom(m_character); - + FALLTHROUGH; case Empty: case AfterCharacterClass: m_state = AfterCharacterClass; @@ -186,7 +166,7 @@ private: case CachedCharacterHyphen: m_delegate.atomCharacterClassAtom(m_character); m_delegate.atomCharacterClassAtom('-'); - // fall through + FALLTHROUGH; case AfterCharacterClassHyphen: m_delegate.atomCharacterClassBuiltIn(classID, invert); m_state = Empty; @@ -214,10 +194,13 @@ private: // invoked with inCharacterClass set. NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } + NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } + bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); return false; } + NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } private: Delegate& m_delegate; - ErrorCode& m_err; + ErrorCode& m_errorCode; enum CharacterClassConstructionState { Empty, CachedCharacter, @@ -225,20 +208,31 @@ private: AfterCharacterClass, AfterCharacterClassHyphen, } m_state; - UChar m_character; + UChar32 m_character; }; - Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) + Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) : m_delegate(delegate) , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.getCharacters<CharType>()) + , m_data(pattern.characters<CharType>()) , m_size(pattern.length()) - , m_index(0) - , m_parenthesesNestingDepth(0) + , m_isUnicode(isUnicode) { } + // The handling of IdentityEscapes is different depending on the unicode flag. + // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. + // For non-unicode patterns, most any character can be escaped. + bool isIdentityEscapeAnError(int ch) + { + if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) { + m_errorCode = ErrorCode::InvalidIdentityEscape; + return true; + } + + return false; + } + /* * parseEscape(): * @@ -262,12 +256,12 @@ private: template<bool inCharacterClass, class EscapeDelegate> bool parseEscape(EscapeDelegate& delegate) { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '\\'); consume(); if (atEndOfPattern()) { - m_err = EscapeUnterminated; + m_errorCode = ErrorCode::EscapeUnterminated; return false; } @@ -275,18 +269,24 @@ private: // Assertions case 'b': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('b')) + break; + delegate.atomPatternCharacter('\b'); - else { + } else { delegate.assertionWordBoundary(false); return false; } break; case 'B': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('B')) + break; + delegate.atomPatternCharacter('B'); - else { + } else { delegate.assertionWordBoundary(true); return false; } @@ -295,27 +295,27 @@ private: // CharacterClassEscape case 'd': consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, false); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); break; case 's': consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, false); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); break; case 'w': consume(); - delegate.atomBuiltInCharacterClass(WordClassID, false); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); break; case 'D': consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, true); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); break; case 'S': consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, true); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); break; case 'W': consume(); - delegate.atomBuiltInCharacterClass(WordClassID, true); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); break; // DecimalEscape @@ -340,15 +340,21 @@ private: } restoreState(state); + + if (m_isUnicode) { + m_errorCode = ErrorCode::InvalidBackreference; + return false; + } } - - // Not a backreference, and not octal. + + // Not a backreference, and not octal. Just a number. if (peek() >= '8') { - delegate.atomPatternCharacter('\\'); + delegate.atomPatternCharacter(consume()); break; } // Fall-through to handle this as an octal escape. + FALLTHROUGH; } // Octal escape @@ -400,32 +406,168 @@ private: case 'x': { consume(); int x = tryConsumeHex(2); - if (x == -1) + if (x == -1) { + if (isIdentityEscapeAnError('x')) + break; + delegate.atomPatternCharacter('x'); - else + } else delegate.atomPatternCharacter(x); break; } + // Named backreference + case 'k': { + consume(); + ParseState state = saveState(); + if (!atEndOfPattern() && !inCharacterClass) { + if (consume() == '<') { + auto groupName = tryConsumeGroupName(); + if (groupName) { + if (m_captureGroupNames.contains(groupName.value())) { + delegate.atomNamedBackReference(groupName.value()); + break; + } + + if (delegate.isValidNamedForwardReference(groupName.value())) { + delegate.atomNamedForwardReference(groupName.value()); + break; + } + } + if (m_isUnicode) { + m_errorCode = ErrorCode::InvalidBackreference; + break; + } + } + } + restoreState(state); + delegate.atomPatternCharacter('k'); + break; + } + + // Unicode property escapes + case 'p': + case 'P': { + int escapeChar = consume(); + + if (!m_isUnicode) { + if (isIdentityEscapeAnError(escapeChar)) + break; + delegate.atomPatternCharacter(escapeChar); + break; + } + + if (!atEndOfPattern() && peek() == '{') { + consume(); + auto optClassID = tryConsumeUnicodePropertyExpression(); + if (!optClassID) { + // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression + break; + } + delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); + } else + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + break; + } + // UnicodeEscape case 'u': { consume(); + if (atEndOfPattern()) { + if (isIdentityEscapeAnError('u')) + break; + + delegate.atomPatternCharacter('u'); + break; + } + + if (m_isUnicode && peek() == '{') { + consume(); + UChar32 codePoint = 0; + do { + if (atEndOfPattern() || !isASCIIHexDigit(peek())) { + m_errorCode = ErrorCode::InvalidUnicodeEscape; + break; + } + + codePoint = (codePoint << 4) | toASCIIHexValue(consume()); + + if (codePoint > UCHAR_MAX_VALUE) + m_errorCode = ErrorCode::InvalidUnicodeEscape; + } while (!atEndOfPattern() && peek() != '}'); + if (!atEndOfPattern() && peek() == '}') + consume(); + else if (!hasError(m_errorCode)) + m_errorCode = ErrorCode::InvalidUnicodeEscape; + if (hasError(m_errorCode)) + return false; + + delegate.atomPatternCharacter(codePoint); + break; + } int u = tryConsumeHex(4); - if (u == -1) + if (u == -1) { + if (isIdentityEscapeAnError('u')) + break; + delegate.atomPatternCharacter('u'); - else + } else { + // If we have the first of a surrogate pair, look for the second. + if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { + ParseState state = saveState(); + consume(); + + if (tryConsume('u')) { + int surrogate2 = tryConsumeHex(4); + if (U16_IS_TRAIL(surrogate2)) { + u = U16_GET_SUPPLEMENTARY(u, surrogate2); + delegate.atomPatternCharacter(u); + break; + } + } + + restoreState(state); + } delegate.atomPatternCharacter(u); + } break; } // IdentityEscape default: + int ch = peek(); + + if (ch == '-' && m_isUnicode && inCharacterClass) { + // \- is allowed for ClassEscape with unicode flag. + delegate.atomPatternCharacter(consume()); + break; + } + + if (isIdentityEscapeAnError(ch)) + break; + delegate.atomPatternCharacter(consume()); } return true; } + UChar32 consumePossibleSurrogatePair() + { + UChar32 ch = consume(); + if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { + ParseState state = saveState(); + + UChar32 surrogate2 = consume(); + if (U16_IS_TRAIL(surrogate2)) + ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); + else + restoreState(state); + } + + return ch; + } + /* * parseAtomEscape(), parseCharacterClassEscape(): * @@ -449,11 +591,11 @@ private: */ void parseCharacterClass() { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '['); consume(); - CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode); characterClassConstructor.begin(tryConsume('^')); @@ -469,14 +611,14 @@ private: break; default: - characterClassConstructor.atomPatternCharacter(consume(), true); + characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); } - if (m_err) + if (hasError(m_errorCode)) return; } - m_err = CharacterClassUnmatched; + m_errorCode = ErrorCode::CharacterClassUnmatched; } /* @@ -486,13 +628,13 @@ private: */ void parseParenthesesBegin() { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '('); consume(); if (tryConsume('?')) { if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; + m_errorCode = ErrorCode::ParenthesesTypeInvalid; return; } @@ -508,9 +650,23 @@ private: case '!': m_delegate.atomParentheticalAssertionBegin(true); break; - + + case '<': { + auto groupName = tryConsumeGroupName(); + if (groupName) { + auto setAddResult = m_captureGroupNames.add(groupName.value()); + if (setAddResult.isNewEntry) + m_delegate.atomParenthesesSubpatternBegin(true, groupName); + else + m_errorCode = ErrorCode::DuplicateGroupName; + } else + m_errorCode = ErrorCode::InvalidGroupName; + + break; + } + default: - m_err = ParenthesesTypeInvalid; + m_errorCode = ErrorCode::ParenthesesTypeInvalid; } } else m_delegate.atomParenthesesSubpatternBegin(); @@ -525,14 +681,14 @@ private: */ void parseParenthesesEnd() { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == ')'); consume(); if (m_parenthesesNestingDepth > 0) m_delegate.atomParenthesesEnd(); else - m_err = ParenthesesUnmatched; + m_errorCode = ErrorCode::ParenthesesUnmatched; --m_parenthesesNestingDepth; } @@ -544,18 +700,18 @@ private: */ void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(min <= max); if (min == UINT_MAX) { - m_err = QuantifierTooLarge; + m_errorCode = ErrorCode::QuantifierTooLarge; return; } if (lastTokenWasAnAtom) m_delegate.quantifyAtom(min, max, !tryConsume('?')); else - m_err = QuantifierWithoutAtom; + m_errorCode = ErrorCode::QuantifierWithoutAtom; } /* @@ -603,7 +759,7 @@ private: case '.': consume(); - m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); lastTokenWasAnAtom = true; break; @@ -649,58 +805,44 @@ private: if (min <= max) parseQuantifier(lastTokenWasAnAtom, min, max); else - m_err = QuantifierOutOfOrder; + m_errorCode = ErrorCode::QuantifierOutOfOrder; lastTokenWasAnAtom = false; break; } } restoreState(state); - } // if we did not find a complete quantifer, fall through to the default case. + } + // if we did not find a complete quantifer, fall through to the default case. + FALLTHROUGH; default: - m_delegate.atomPatternCharacter(consume()); + m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); lastTokenWasAnAtom = true; } - if (m_err) + if (hasError(m_errorCode)) return; } if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; + m_errorCode = ErrorCode::MissingParentheses; } /* * parse(): * - * This method calls parseTokens() to parse over the input and converts any - * error code to a const char* for a result. + * This method calls parseTokens() to parse over the input and returns error code for a result. */ - const char* parse() + ErrorCode parse() { if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; + m_errorCode = ErrorCode::PatternTooLarge; else parseTokens(); - ASSERT(atEndOfPattern() || m_err); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - REGEXP_ERROR_PREFIX "regular expression too large", - REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", - REGEXP_ERROR_PREFIX "nothing to repeat", - REGEXP_ERROR_PREFIX "number too large in {} quantifier", - REGEXP_ERROR_PREFIX "missing )", - REGEXP_ERROR_PREFIX "unmatched parentheses", - REGEXP_ERROR_PREFIX "unrecognized character after (?", - REGEXP_ERROR_PREFIX "missing terminating ] for character class", - REGEXP_ERROR_PREFIX "range out of order in character class", - REGEXP_ERROR_PREFIX "\\ at end of pattern" - }; - - return errorMessages[m_err]; + ASSERT(atEndOfPattern() || hasError(m_errorCode)); + + return m_errorCode; } // Misc helper functions: @@ -723,6 +865,12 @@ private: return m_index == m_size; } + unsigned patternRemaining() + { + ASSERT(m_index <= m_size); + return m_size - m_index; + } + int peek() { ASSERT(m_index < m_size); @@ -740,6 +888,87 @@ private: return peek() - '0'; } + int tryConsumeUnicodeEscape() + { + if (!tryConsume('u')) + return -1; + + if (m_isUnicode && tryConsume('{')) { + int codePoint = 0; + do { + if (atEndOfPattern() || !isASCIIHexDigit(peek())) { + m_errorCode = ErrorCode::InvalidUnicodeEscape; + return -1; + } + + codePoint = (codePoint << 4) | toASCIIHexValue(consume()); + + if (codePoint > UCHAR_MAX_VALUE) { + m_errorCode = ErrorCode::InvalidUnicodeEscape; + return -1; + } + } while (!atEndOfPattern() && peek() != '}'); + if (!atEndOfPattern() && peek() == '}') + consume(); + else if (!hasError(m_errorCode)) + m_errorCode = ErrorCode::InvalidUnicodeEscape; + if (hasError(m_errorCode)) + return -1; + + return codePoint; + } + + int u = tryConsumeHex(4); + if (u == -1) + return -1; + + // If we have the first of a surrogate pair, look for the second. + if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { + ParseState state = saveState(); + consume(); + + if (tryConsume('u')) { + int surrogate2 = tryConsumeHex(4); + if (U16_IS_TRAIL(surrogate2)) { + u = U16_GET_SUPPLEMENTARY(u, surrogate2); + return u; + } + } + + restoreState(state); + } + + return u; + } + + int tryConsumeIdentifierCharacter() + { + int ch = peek(); + + if (ch == '\\') { + consume(); + ch = tryConsumeUnicodeEscape(); + } else + consume(); + + return ch; + } + + bool isIdentifierStart(int ch) + { + return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); + } + + bool isIdentifierPart(int ch) + { + return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; + } + + bool isUnicodePropertyValueExpressionChar(int ch) + { + return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '='; + } + int consume() { ASSERT(m_index < m_size); @@ -754,13 +983,10 @@ private: unsigned consumeNumber() { - unsigned n = consumeDigit(); - // check for overflow. - for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { - n = newValue; - consume(); - } - return n; + Checked<unsigned, RecordOverflow> n = consumeDigit(); + while (peekIsDigit()) + n = n * 10 + consumeDigit(); + return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); } unsigned consumeOctal() @@ -796,13 +1022,99 @@ private: return n; } + std::optional<String> tryConsumeGroupName() + { + if (atEndOfPattern()) + return std::nullopt; + + ParseState state = saveState(); + + int ch = tryConsumeIdentifierCharacter(); + + if (isIdentifierStart(ch)) { + StringBuilder identifierBuilder; + identifierBuilder.append(ch); + + while (!atEndOfPattern()) { + ch = tryConsumeIdentifierCharacter(); + if (ch == '>') + return std::optional<String>(identifierBuilder.toString()); + + if (!isIdentifierPart(ch)) + break; + + identifierBuilder.append(ch); + } + } + + restoreState(state); + + return std::nullopt; + } + + std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression() + { + if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) { + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return std::nullopt; + } + + StringBuilder expressionBuilder; + String unicodePropertyName; + bool foundEquals = false; + unsigned errors = 0; + + expressionBuilder.append(consume()); + + while (!atEndOfPattern()) { + int ch = peek(); + if (ch == '}') { + consume(); + if (errors) { + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return std::nullopt; + } + + if (foundEquals) { + auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString()); + if (!result) + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return result; + } + + auto result = unicodeMatchProperty(expressionBuilder.toString()); + if (!result) + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return result; + } + + consume(); + if (ch == '=') { + if (!foundEquals) { + foundEquals = true; + unicodePropertyName = expressionBuilder.toString(); + expressionBuilder.clear(); + } else + errors++; + } else if (!isUnicodePropertyValueExpressionChar(ch)) + errors++; + else + expressionBuilder.append(ch); + } + + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return std::nullopt; + } + Delegate& m_delegate; unsigned m_backReferenceLimit; - ErrorCode m_err; + ErrorCode m_errorCode { ErrorCode::NoError }; const CharType* m_data; unsigned m_size; - unsigned m_index; - unsigned m_parenthesesNestingDepth; + unsigned m_index { 0 }; + bool m_isUnicode; + unsigned m_parenthesesNestingDepth { 0 }; + HashSet<String> m_captureGroupNames; // Derived by empirical testing of compile time in PCRE and WREC. static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; @@ -822,17 +1134,20 @@ private: * void assertionEOL(); * void assertionWordBoundary(bool invert); * - * void atomPatternCharacter(UChar ch); + * void atomPatternCharacter(UChar32 ch); * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassAtom(UChar32 ch) + * void atomCharacterClassRange(UChar32 begin, UChar32 end) * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) * void atomCharacterClassEnd() - * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName); * void atomParentheticalAssertionBegin(bool invert = false); * void atomParenthesesEnd(); * void atomBackReference(unsigned subpatternId); + * void atomNamedBackReference(const String& subpatternName); + * bool isValidNamedForwardReference(const String& subpatternName); + * void atomNamedForwardReference(const String& subpatternName); * * void quantifyAtom(unsigned min, unsigned max, bool greedy); * @@ -868,13 +1183,11 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) +ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) { if (pattern.is8Bit()) - return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); - return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); + return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); + return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); } } } // namespace JSC::Yarr - -#endif // YarrParser_h |