/* * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include "Yarr.h" #include "YarrPattern.h" #include "YarrUnicodeProperties.h" #include #include #include #include #include namespace JSC { namespace Yarr { // The Parser class should not be used directly - only via the Yarr::parse() method. template class Parser { private: template friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); /* * CharacterClassParserDelegate: * * The class CharacterClassParserDelegate is used in the parsing of character * classes. This class handles detection of character ranges. This class * implements enough of the delegate interface such that it can be passed to * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused * to perform the parsing of escape characters in character sets. */ class CharacterClassParserDelegate { public: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) , m_errorCode(err) , m_state(Empty) , m_character(0) { } /* * begin(): * * Called at beginning of construction. */ void begin(bool invert) { m_delegate.atomCharacterClassBegin(invert); } /* * atomPatternCharacter(): * * This method is called either from parseCharacterClass() (for an unescaped * character in a character class), or from parseEscape(). In the former case * the value true will be passed for the argument 'hyphenIsRange', and in this * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ * is different to /[a\-z]/). */ void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) { switch (m_state) { case AfterCharacterClass: // Following a builtin character class we need look out for a hyphen. // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. // If we see a hyphen following a charater class then unlike usual // we'll report it to the delegate immediately, and put ourself into // a poisoned state. Any following calls to add another character or // character class will result in an error. (A hypen following a // character-class is itself valid, but only at the end of a regex). if (hyphenIsRange && ch == '-') { m_delegate.atomCharacterClassAtom('-'); m_state = AfterCharacterClassHyphen; return; } // Otherwise just fall through - cached character so treat this as Empty. FALLTHROUGH; case Empty: m_character = ch; m_state = CachedCharacter; return; case CachedCharacter: if (hyphenIsRange && ch == '-') m_state = CachedCharacterHyphen; else { m_delegate.atomCharacterClassAtom(m_character); m_character = ch; } return; case CachedCharacterHyphen: if (ch < m_character) { m_errorCode = ErrorCode::CharacterClassOutOfOrder; return; } m_delegate.atomCharacterClassRange(m_character, ch); m_state = Empty; return; // See coment in atomBuiltInCharacterClass below. // This too is technically an error, per ECMA-262, and again we // we chose to allow this. Note a subtlely here that while we // diverge from the spec's definition of CharacterRange we do // remain in compliance with the grammar. For example, consider // the expression /[\d-a-z]/. We comply with the grammar in // this case by not allowing a-z to be matched as a range. case AfterCharacterClassHyphen: m_delegate.atomCharacterClassAtom(ch); m_state = Empty; return; } } /* * atomBuiltInCharacterClass(): * * Adds a built-in character class, called by parseEscape(). */ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) { switch (m_state) { case CachedCharacter: // Flush the currently cached character, then fall through. m_delegate.atomCharacterClassAtom(m_character); FALLTHROUGH; case Empty: case AfterCharacterClass: m_state = AfterCharacterClass; m_delegate.atomCharacterClassBuiltIn(classID, invert); return; // If we hit either of these cases, we have an invalid range that // looks something like /[x-\d]/ or /[\d-\d]/. // According to ECMA-262 this should be a syntax error, but // empirical testing shows this to break teh webz. Instead we // comply with to the ECMA-262 grammar, and assume the grammar to // have matched the range correctly, but tweak our interpretation // of CharacterRange. Effectively we implicitly handle the hyphen // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. case CachedCharacterHyphen: m_delegate.atomCharacterClassAtom(m_character); m_delegate.atomCharacterClassAtom('-'); FALLTHROUGH; case AfterCharacterClassHyphen: m_delegate.atomCharacterClassBuiltIn(classID, invert); m_state = Empty; return; } } /* * end(): * * Called at end of construction. */ void end() { if (m_state == CachedCharacter) m_delegate.atomCharacterClassAtom(m_character); else if (m_state == CachedCharacterHyphen) { m_delegate.atomCharacterClassAtom(m_character); m_delegate.atomCharacterClassAtom('-'); } m_delegate.atomCharacterClassEnd(); } // parseEscape() should never call these delegate methods when // invoked with inCharacterClass set. NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(String) { RELEASE_ASSERT_NOT_REACHED(); } private: Delegate& m_delegate; ErrorCode& m_errorCode; enum CharacterClassConstructionState { Empty, CachedCharacter, CachedCharacterHyphen, AfterCharacterClass, AfterCharacterClassHyphen, } m_state; UChar32 m_character; }; Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) : m_delegate(delegate) , m_backReferenceLimit(backReferenceLimit) , m_data(pattern.characters()) , m_size(pattern.length()) , m_isUnicode(isUnicode) { } // The handling of IdentityEscapes is different depending on the unicode flag. // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. // For non-unicode patterns, most any character can be escaped. bool isIdentityEscapeAnError(int ch) { if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) { m_errorCode = ErrorCode::InvalidIdentityEscape; return true; } return false; } /* * parseEscape(): * * Helper for parseTokens() AND parseCharacterClass(). * Unlike the other parser methods, this function does not report tokens * directly to the member delegate (m_delegate), instead tokens are * emitted to the delegate provided as an argument. In the case of atom * escapes, parseTokens() will call parseEscape() passing m_delegate as * an argument, and as such the escape will be reported to the delegate. * * However this method may also be used by parseCharacterClass(), in which * case a CharacterClassParserDelegate will be passed as the delegate that * tokens should be added to. A boolean flag is also provided to indicate * whether that an escape in a CharacterClass is being parsed (some parsing * rules change in this context). * * The boolean value returned by this method indicates whether the token * parsed was an atom (outside of a characted class \b and \B will be * interpreted as assertions). */ template bool parseEscape(EscapeDelegate& delegate) { ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '\\'); consume(); if (atEndOfPattern()) { m_errorCode = ErrorCode::EscapeUnterminated; return false; } switch (peek()) { // Assertions case 'b': consume(); if (inCharacterClass) { if (isIdentityEscapeAnError('b')) break; delegate.atomPatternCharacter('\b'); } else { delegate.assertionWordBoundary(false); return false; } break; case 'B': consume(); if (inCharacterClass) { if (isIdentityEscapeAnError('B')) break; delegate.atomPatternCharacter('B'); } else { delegate.assertionWordBoundary(true); return false; } break; // CharacterClassEscape case 'd': consume(); delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); break; case 's': consume(); delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); break; case 'w': consume(); delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); break; case 'D': consume(); delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); break; case 'S': consume(); delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); break; case 'W': consume(); delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); break; // DecimalEscape case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. // First, try to parse this as backreference. if (!inCharacterClass) { ParseState state = saveState(); unsigned backReference = consumeNumber(); if (backReference <= m_backReferenceLimit) { delegate.atomBackReference(backReference); break; } restoreState(state); if (m_isUnicode) { m_errorCode = ErrorCode::InvalidBackreference; return false; } } // Not a backreference, and not octal. Just a number. if (peek() >= '8') { delegate.atomPatternCharacter(consume()); break; } // Fall-through to handle this as an octal escape. FALLTHROUGH; } // Octal escape case '0': delegate.atomPatternCharacter(consumeOctal()); break; // ControlEscape case 'f': consume(); delegate.atomPatternCharacter('\f'); break; case 'n': consume(); delegate.atomPatternCharacter('\n'); break; case 'r': consume(); delegate.atomPatternCharacter('\r'); break; case 't': consume(); delegate.atomPatternCharacter('\t'); break; case 'v': consume(); delegate.atomPatternCharacter('\v'); break; // ControlLetter case 'c': { ParseState state = saveState(); consume(); if (!atEndOfPattern()) { int control = consume(); // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { delegate.atomPatternCharacter(control & 0x1f); break; } } restoreState(state); delegate.atomPatternCharacter('\\'); break; } // HexEscape case 'x': { consume(); int x = tryConsumeHex(2); if (x == -1) { if (isIdentityEscapeAnError('x')) break; delegate.atomPatternCharacter('x'); } else delegate.atomPatternCharacter(x); break; } // Named backreference case 'k': { consume(); ParseState state = saveState(); if (!atEndOfPattern() && !inCharacterClass) { if (consume() == '<') { auto groupName = tryConsumeGroupName(); if (groupName && m_captureGroupNames.contains(groupName.value())) { delegate.atomNamedBackReference(groupName.value()); break; } if (m_isUnicode) { m_errorCode = ErrorCode::InvalidBackreference; break; } } } restoreState(state); delegate.atomPatternCharacter('k'); break; } // Unicode property escapes case 'p': case 'P': { int escapeChar = consume(); if (!m_isUnicode) { if (isIdentityEscapeAnError(escapeChar)) break; delegate.atomPatternCharacter(escapeChar); break; } if (!atEndOfPattern() && peek() == '{') { consume(); auto optClassID = tryConsumeUnicodePropertyExpression(); if (!optClassID) { // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression break; } delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); } else m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; break; } // UnicodeEscape case 'u': { consume(); if (atEndOfPattern()) { if (isIdentityEscapeAnError('u')) break; delegate.atomPatternCharacter('u'); break; } if (m_isUnicode && peek() == '{') { consume(); UChar32 codePoint = 0; do { if (atEndOfPattern() || !isASCIIHexDigit(peek())) { m_errorCode = ErrorCode::InvalidUnicodeEscape; break; } codePoint = (codePoint << 4) | toASCIIHexValue(consume()); if (codePoint > UCHAR_MAX_VALUE) m_errorCode = ErrorCode::InvalidUnicodeEscape; } while (!atEndOfPattern() && peek() != '}'); if (!atEndOfPattern() && peek() == '}') consume(); else if (!hasError(m_errorCode)) m_errorCode = ErrorCode::InvalidUnicodeEscape; if (hasError(m_errorCode)) return false; delegate.atomPatternCharacter(codePoint); break; } int u = tryConsumeHex(4); if (u == -1) { if (isIdentityEscapeAnError('u')) break; delegate.atomPatternCharacter('u'); } else { // If we have the first of a surrogate pair, look for the second. if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { ParseState state = saveState(); consume(); if (tryConsume('u')) { int surrogate2 = tryConsumeHex(4); if (U16_IS_TRAIL(surrogate2)) { u = U16_GET_SUPPLEMENTARY(u, surrogate2); delegate.atomPatternCharacter(u); break; } } restoreState(state); } delegate.atomPatternCharacter(u); } break; } // IdentityEscape default: int ch = peek(); if (ch == '-' && m_isUnicode && inCharacterClass) { // \- is allowed for ClassEscape with unicode flag. delegate.atomPatternCharacter(consume()); break; } if (isIdentityEscapeAnError(ch)) break; delegate.atomPatternCharacter(consume()); } return true; } UChar32 consumePossibleSurrogatePair() { UChar32 ch = consume(); if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { ParseState state = saveState(); UChar32 surrogate2 = consume(); if (U16_IS_TRAIL(surrogate2)) ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); else restoreState(state); } return ch; } /* * parseAtomEscape(), parseCharacterClassEscape(): * * These methods alias to parseEscape(). */ bool parseAtomEscape() { return parseEscape(m_delegate); } void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) { parseEscape(delegate); } /* * parseCharacterClass(): * * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) * to an instance of CharacterClassParserDelegate, to describe the character class to the * delegate. */ void parseCharacterClass() { ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '['); consume(); CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode); characterClassConstructor.begin(tryConsume('^')); while (!atEndOfPattern()) { switch (peek()) { case ']': consume(); characterClassConstructor.end(); return; case '\\': parseCharacterClassEscape(characterClassConstructor); break; default: characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); } if (hasError(m_errorCode)) return; } m_errorCode = ErrorCode::CharacterClassUnmatched; } /* * parseParenthesesBegin(): * * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. */ void parseParenthesesBegin() { ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '('); consume(); if (tryConsume('?')) { if (atEndOfPattern()) { m_errorCode = ErrorCode::ParenthesesTypeInvalid; return; } switch (consume()) { case ':': m_delegate.atomParenthesesSubpatternBegin(false); break; case '=': m_delegate.atomParentheticalAssertionBegin(); break; case '!': m_delegate.atomParentheticalAssertionBegin(true); break; case '<': { auto groupName = tryConsumeGroupName(); if (groupName) { auto setAddResult = m_captureGroupNames.add(groupName.value()); if (setAddResult.isNewEntry) m_delegate.atomParenthesesSubpatternBegin(true, groupName); else m_errorCode = ErrorCode::DuplicateGroupName; } else m_errorCode = ErrorCode::InvalidGroupName; break; } default: m_errorCode = ErrorCode::ParenthesesTypeInvalid; } } else m_delegate.atomParenthesesSubpatternBegin(); ++m_parenthesesNestingDepth; } /* * parseParenthesesEnd(): * * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). */ void parseParenthesesEnd() { ASSERT(!hasError(m_errorCode)); ASSERT(peek() == ')'); consume(); if (m_parenthesesNestingDepth > 0) m_delegate.atomParenthesesEnd(); else m_errorCode = ErrorCode::ParenthesesUnmatched; --m_parenthesesNestingDepth; } /* * parseQuantifier(): * * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. */ void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) { ASSERT(!hasError(m_errorCode)); ASSERT(min <= max); if (min == UINT_MAX) { m_errorCode = ErrorCode::QuantifierTooLarge; return; } if (lastTokenWasAnAtom) m_delegate.quantifyAtom(min, max, !tryConsume('?')); else m_errorCode = ErrorCode::QuantifierWithoutAtom; } /* * parseTokens(): * * This method loops over the input pattern reporting tokens to the delegate. * The method returns when a parse error is detected, or the end of the pattern * is reached. One piece of state is tracked around the loop, which is whether * the last token passed to the delegate was an atom (this is necessary to detect * a parse error when a quantifier provided without an atom to quantify). */ void parseTokens() { bool lastTokenWasAnAtom = false; while (!atEndOfPattern()) { switch (peek()) { case '|': consume(); m_delegate.disjunction(); lastTokenWasAnAtom = false; break; case '(': parseParenthesesBegin(); lastTokenWasAnAtom = false; break; case ')': parseParenthesesEnd(); lastTokenWasAnAtom = true; break; case '^': consume(); m_delegate.assertionBOL(); lastTokenWasAnAtom = false; break; case '$': consume(); m_delegate.assertionEOL(); lastTokenWasAnAtom = false; break; case '.': consume(); m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); lastTokenWasAnAtom = true; break; case '[': parseCharacterClass(); lastTokenWasAnAtom = true; break; case '\\': lastTokenWasAnAtom = parseAtomEscape(); break; case '*': consume(); parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); lastTokenWasAnAtom = false; break; case '+': consume(); parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); lastTokenWasAnAtom = false; break; case '?': consume(); parseQuantifier(lastTokenWasAnAtom, 0, 1); lastTokenWasAnAtom = false; break; case '{': { ParseState state = saveState(); consume(); if (peekIsDigit()) { unsigned min = consumeNumber(); unsigned max = min; if (tryConsume(',')) max = peekIsDigit() ? consumeNumber() : quantifyInfinite; if (tryConsume('}')) { if (min <= max) parseQuantifier(lastTokenWasAnAtom, min, max); else m_errorCode = ErrorCode::QuantifierOutOfOrder; lastTokenWasAnAtom = false; break; } } restoreState(state); } // if we did not find a complete quantifer, fall through to the default case. FALLTHROUGH; default: m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); lastTokenWasAnAtom = true; } if (hasError(m_errorCode)) return; } if (m_parenthesesNestingDepth > 0) m_errorCode = ErrorCode::MissingParentheses; } /* * parse(): * * This method calls parseTokens() to parse over the input and returns error code for a result. */ ErrorCode parse() { if (m_size > MAX_PATTERN_SIZE) m_errorCode = ErrorCode::PatternTooLarge; else parseTokens(); ASSERT(atEndOfPattern() || hasError(m_errorCode)); return m_errorCode; } // Misc helper functions: typedef unsigned ParseState; ParseState saveState() { return m_index; } void restoreState(ParseState state) { m_index = state; } bool atEndOfPattern() { ASSERT(m_index <= m_size); return m_index == m_size; } unsigned patternRemaining() { ASSERT(m_index <= m_size); return m_size - m_index; } int peek() { ASSERT(m_index < m_size); return m_data[m_index]; } bool peekIsDigit() { return !atEndOfPattern() && WTF::isASCIIDigit(peek()); } unsigned peekDigit() { ASSERT(peekIsDigit()); return peek() - '0'; } int tryConsumeUnicodeEscape() { if (!tryConsume('u')) return -1; if (m_isUnicode && tryConsume('{')) { int codePoint = 0; do { if (atEndOfPattern() || !isASCIIHexDigit(peek())) { m_errorCode = ErrorCode::InvalidUnicodeEscape; return -1; } codePoint = (codePoint << 4) | toASCIIHexValue(consume()); if (codePoint > UCHAR_MAX_VALUE) { m_errorCode = ErrorCode::InvalidUnicodeEscape; return -1; } } while (!atEndOfPattern() && peek() != '}'); if (!atEndOfPattern() && peek() == '}') consume(); else if (!hasError(m_errorCode)) m_errorCode = ErrorCode::InvalidUnicodeEscape; if (hasError(m_errorCode)) return -1; return codePoint; } int u = tryConsumeHex(4); if (u == -1) return -1; // If we have the first of a surrogate pair, look for the second. if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { ParseState state = saveState(); consume(); if (tryConsume('u')) { int surrogate2 = tryConsumeHex(4); if (U16_IS_TRAIL(surrogate2)) { u = U16_GET_SUPPLEMENTARY(u, surrogate2); return u; } } restoreState(state); } return u; } int tryConsumeIdentifierCharacter() { int ch = peek(); if (ch == '\\') { consume(); ch = tryConsumeUnicodeEscape(); } else consume(); return ch; } bool isIdentifierStart(int ch) { return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); } bool isIdentifierPart(int ch) { return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; } bool isUnicodePropertyValueExpressionChar(int ch) { return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '='; } int consume() { ASSERT(m_index < m_size); return m_data[m_index++]; } unsigned consumeDigit() { ASSERT(peekIsDigit()); return consume() - '0'; } unsigned consumeNumber() { Checked n = consumeDigit(); while (peekIsDigit()) n = n * 10 + consumeDigit(); return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); } unsigned consumeOctal() { ASSERT(WTF::isASCIIOctalDigit(peek())); unsigned n = consumeDigit(); while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) n = n * 8 + consumeDigit(); return n; } bool tryConsume(UChar ch) { if (atEndOfPattern() || (m_data[m_index] != ch)) return false; ++m_index; return true; } int tryConsumeHex(int count) { ParseState state = saveState(); int n = 0; while (count--) { if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { restoreState(state); return -1; } n = (n << 4) | WTF::toASCIIHexValue(consume()); } return n; } std::optional tryConsumeGroupName() { if (atEndOfPattern()) return std::nullopt; ParseState state = saveState(); int ch = tryConsumeIdentifierCharacter(); if (isIdentifierStart(ch)) { StringBuilder identifierBuilder; identifierBuilder.append(ch); while (!atEndOfPattern()) { ch = tryConsumeIdentifierCharacter(); if (ch == '>') return std::optional(identifierBuilder.toString()); if (!isIdentifierPart(ch)) break; identifierBuilder.append(ch); } } restoreState(state); return std::nullopt; } std::optional tryConsumeUnicodePropertyExpression() { if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) { m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; return std::nullopt; } StringBuilder expressionBuilder; String unicodePropertyName; bool foundEquals = false; unsigned errors = 0; expressionBuilder.append(consume()); while (!atEndOfPattern()) { int ch = peek(); if (ch == '}') { consume(); if (errors) { m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; return std::nullopt; } if (foundEquals) { auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString()); if (!result) m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; return result; } auto result = unicodeMatchProperty(expressionBuilder.toString()); if (!result) m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; return result; } consume(); if (ch == '=') { if (!foundEquals) { foundEquals = true; unicodePropertyName = expressionBuilder.toString(); expressionBuilder.clear(); } else errors++; } else if (!isUnicodePropertyValueExpressionChar(ch)) errors++; else expressionBuilder.append(ch); } m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; return std::nullopt; } Delegate& m_delegate; unsigned m_backReferenceLimit; ErrorCode m_errorCode { ErrorCode::NoError }; const CharType* m_data; unsigned m_size; unsigned m_index { 0 }; bool m_isUnicode; unsigned m_parenthesesNestingDepth { 0 }; HashSet m_captureGroupNames; // Derived by empirical testing of compile time in PCRE and WREC. static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; }; /* * Yarr::parse(): * * The parse method is passed a pattern to be parsed and a delegate upon which * callbacks will be made to record the parsed tokens forming the regex. * Yarr::parse() returns null on success, or a const C string providing an error * message where a parse error occurs. * * The Delegate must implement the following interface: * * void assertionBOL(); * void assertionEOL(); * void assertionWordBoundary(bool invert); * * void atomPatternCharacter(UChar32 ch); * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); * void atomCharacterClassBegin(bool invert) * void atomCharacterClassAtom(UChar32 ch) * void atomCharacterClassRange(UChar32 begin, UChar32 end) * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) * void atomCharacterClassEnd() * void atomParenthesesSubpatternBegin(bool capture = true, std::optional groupName); * void atomParentheticalAssertionBegin(bool invert = false); * void atomParenthesesEnd(); * void atomBackReference(unsigned subpatternId); * void atomNamedBackReference(String subpatternName); * * void quantifyAtom(unsigned min, unsigned max, bool greedy); * * void disjunction(); * * The regular expression is described by a sequence of assertion*() and atom*() * callbacks to the delegate, describing the terms in the regular expression. * Following an atom a quantifyAtom() call may occur to indicate that the previous * atom should be quantified. In the case of atoms described across multiple * calls (parentheses and character classes) the call to quantifyAtom() will come * after the call to the atom*End() method, never after atom*Begin(). * * Character classes may either be described by a single call to * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. * In the latter case, ...Begin() will be called, followed by a sequence of * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). * * Sequences of atoms and assertions are broken into alternatives via calls to * disjunction(). Assertions, atoms, and disjunctions emitted between calls to * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. * atomParenthesesBegin() is passed a subpatternId. In the case of a regular * capturing subpattern, this will be the subpatternId associated with these * parentheses, and will also by definition be the lowest subpatternId of these * parentheses and of any nested paretheses. The atomParenthesesEnd() method * is passed the subpatternId of the last capturing subexpression nested within * these paretheses. In the case of a capturing subpattern with no nested * capturing subpatterns, the same subpatternId will be passed to the begin and * end functions. In the case of non-capturing subpatterns the subpatternId * passed to the begin method is also the first possible subpatternId that might * be nested within these paretheses. If a set of non-capturing parentheses does * not contain any capturing subpatterns, then the subpatternId passed to begin * will be greater than the subpatternId passed to end. */ template ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) { if (pattern.is8Bit()) return Parser(delegate, pattern, isUnicode, backReferenceLimit).parse(); return Parser(delegate, pattern, isUnicode, backReferenceLimit).parse(); } } } // namespace JSC::Yarr