diff options
Diffstat (limited to 'src/3rdparty/masm/yarr/YarrPattern.cpp')
-rw-r--r-- | src/3rdparty/masm/yarr/YarrPattern.cpp | 962 |
1 files changed, 795 insertions, 167 deletions
diff --git a/src/3rdparty/masm/yarr/YarrPattern.cpp b/src/3rdparty/masm/yarr/YarrPattern.cpp index c7e5b6b09b..ac66ea1b9a 100644 --- a/src/3rdparty/masm/yarr/YarrPattern.cpp +++ b/src/3rdparty/masm/yarr/YarrPattern.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -27,10 +27,15 @@ #include "config.h" #include "YarrPattern.h" +#include "Options.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #include "YarrParser.h" +#include <wtf/DataLog.h> +#include <wtf/Optional.h> +//#include <wtf/Threading.h> #include <wtf/Vector.h> +#include <wtf/text/WTFString.h> using namespace WTF; @@ -40,8 +45,11 @@ namespace JSC { namespace Yarr { class CharacterClassConstructor { public: - CharacterClassConstructor(bool isCaseInsensitive = false) + CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode) : m_isCaseInsensitive(isCaseInsensitive) + , m_hasNonBMPCharacters(false) + , m_anyCharacter(false) + , m_canonicalMode(canonicalMode) { } @@ -51,6 +59,8 @@ public: m_ranges.clear(); m_matchesUnicode.clear(); m_rangesUnicode.clear(); + m_hasNonBMPCharacters = false; + m_anyCharacter = false; } void append(const CharacterClass* other) @@ -65,11 +75,71 @@ public: addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); } - void putChar(UChar ch) + void appendInverted(const CharacterClass* other) { - // Handle ascii cases. - if (ch <= 0x7f) { - if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + auto addSortedInverted = [&](UChar32 min, UChar32 max, + const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges, + Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) { + + auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) { + if (lo < hiPlusOne) { + if (lo + 1 == hiPlusOne) + addSorted(destMatches, lo); + else + addSortedRange(destRanges, lo, hiPlusOne - 1); + } + }; + + UChar32 lo = min; + size_t matchesIndex = 0; + size_t rangesIndex = 0; + bool matchesRemaining = matchesIndex < srcMatches.size(); + bool rangesRemaining = rangesIndex < srcRanges.size(); + + if (!matchesRemaining && !rangesRemaining) { + addSortedMatchOrRange(min, max + 1); + return; + } + + while (matchesRemaining || rangesRemaining) { + UChar32 hiPlusOne; + UChar32 nextLo; + + if (matchesRemaining + && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) { + hiPlusOne = srcMatches[matchesIndex]; + nextLo = hiPlusOne + 1; + ++matchesIndex; + matchesRemaining = matchesIndex < srcMatches.size(); + } else { + hiPlusOne = srcRanges[rangesIndex].begin; + nextLo = srcRanges[rangesIndex].end + 1; + ++rangesIndex; + rangesRemaining = rangesIndex < srcRanges.size(); + } + + addSortedMatchOrRange(lo, hiPlusOne); + + lo = nextLo; + } + + addSortedMatchOrRange(lo, max + 1); + }; + + addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges); + addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode); + } + + void putChar(UChar32 ch) + { + if (!m_isCaseInsensitive) { + addSorted(ch); + return; + } + + if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) { + // Handle ASCII cases. + if (isASCIIAlpha(ch)) { addSorted(m_matches, toASCIIUpper(ch)); addSorted(m_matches, toASCIILower(ch)); } else @@ -77,40 +147,33 @@ public: return; } - // Simple case, not a case-insensitive match. - if (!m_isCaseInsensitive) { - addSorted(m_matchesUnicode, ch); - return; - } - // Add multiple matches, if necessary. - UCS2CanonicalizationRange* info = rangeInfoFor(ch); + const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode); if (info->type == CanonicalizeUnique) - addSorted(m_matchesUnicode, ch); + addSorted(ch); else putUnicodeIgnoreCase(ch, info); } - void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) + void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info) { ASSERT(m_isCaseInsensitive); - ASSERT(ch > 0x7f); ASSERT(ch >= info->begin && ch <= info->end); ASSERT(info->type != CanonicalizeUnique); if (info->type == CanonicalizeSet) { - for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) - addSorted(m_matchesUnicode, ch); + for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set) + addSorted(ch); } else { - addSorted(m_matchesUnicode, ch); - addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + addSorted(ch); + addSorted(getCanonicalPair(info, ch)); } } - void putRange(UChar lo, UChar hi) + void putRange(UChar32 lo, UChar32 hi) { - if (lo <= 0x7f) { + if (isASCII(lo)) { char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); + char asciiHi = std::min(hi, (UChar32)0x7f); addSortedRange(m_ranges, lo, asciiHi); if (m_isCaseInsensitive) { @@ -120,19 +183,19 @@ public: addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); } } - if (hi <= 0x7f) + if (isASCII(hi)) return; - lo = std::max(lo, (UChar)0x80); + lo = std::max(lo, (UChar32)0x80); addSortedRange(m_rangesUnicode, lo, hi); if (!m_isCaseInsensitive) return; - UCS2CanonicalizationRange* info = rangeInfoFor(lo); + const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode); while (true) { // Handle the range [lo .. end] - UChar end = std::min<UChar>(info->end, hi); + UChar32 end = std::min<UChar32>(info->end, hi); switch (info->type) { case CanonicalizeUnique: @@ -140,7 +203,7 @@ public: break; case CanonicalizeSet: { UChar ch; - for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set) addSorted(m_matchesUnicode, ch); break; } @@ -175,24 +238,38 @@ public: } - PassOwnPtr<CharacterClass> charClass() + std::unique_ptr<CharacterClass> charClass() { - OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass); + coalesceTables(); + + auto characterClass = std::make_unique<CharacterClass>(); characterClass->m_matches.swap(m_matches); characterClass->m_ranges.swap(m_ranges); characterClass->m_matchesUnicode.swap(m_matchesUnicode); characterClass->m_rangesUnicode.swap(m_rangesUnicode); + characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters(); + characterClass->m_anyCharacter = anyCharacter(); + + m_hasNonBMPCharacters = false; + m_anyCharacter = false; - return characterClass.release(); + return characterClass; } private: - void addSorted(Vector<UChar>& matches, UChar ch) + void addSorted(UChar32 ch) + { + addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch); + } + + void addSorted(Vector<UChar32>& matches, UChar32 ch) { unsigned pos = 0; - ASSERT(matches.size() <= UINT_MAX); - unsigned range = static_cast<unsigned>(matches.size()); + unsigned range = matches.size(); + + if (!U_IS_BMP(ch)) + m_hasNonBMPCharacters = true; // binary chop, find position to insert char. while (range) { @@ -201,9 +278,31 @@ private: int val = matches[pos+index] - ch; if (!val) return; - else if (val > 0) + else if (val > 0) { + if (val == 1) { + UChar32 lo = ch; + UChar32 hi = ch + 1; + matches.remove(pos + index); + if (pos + index > 0 && matches[pos + index - 1] == ch - 1) { + lo = ch - 1; + matches.remove(pos + index - 1); + } + addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); + return; + } range = index; - else { + } else { + if (val == -1) { + UChar32 lo = ch - 1; + UChar32 hi = ch; + matches.remove(pos + index); + if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) { + hi = ch + 1; + matches.remove(pos + index + 1); + } + addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); + return; + } pos += (index+1); range -= (index+1); } @@ -215,17 +314,19 @@ private: matches.insert(pos, ch); } - void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi) { - ASSERT(ranges.size() <= UINT_MAX); - unsigned end = static_cast<unsigned>(ranges.size()); - + size_t end = ranges.size(); + + if (!U_IS_BMP(hi)) + m_hasNonBMPCharacters = true; + // Simple linear scan - I doubt there are that many ranges anyway... // feel free to fix this with something faster (eg binary chop). - for (unsigned i = 0; i < end; ++i) { + for (size_t i = 0; i < end; ++i) { // does the new range fall before the current position in the array if (hi < ranges[i].begin) { - // optional optimization: concatenate appending ranges? - may not be worthwhile. + // Concatenate appending ranges. if (hi == (ranges[i].begin - 1)) { ranges[i].begin = lo; return; @@ -233,7 +334,7 @@ private: ranges.insert(i, CharacterRange(lo, hi)); return; } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining // If the new range start at or before the end of the last range, then the overlap (if it starts one after the // end of the last range they concatenate, which is just as good. if (lo <= (ranges[i].end + 1)) { @@ -241,18 +342,7 @@ private: ranges[i].begin = std::min(ranges[i].begin, lo); ranges[i].end = std::max(ranges[i].end, hi); - // now check if the new range can subsume any subsequent ranges. - unsigned next = i+1; - // each iteration of the loop we will either remove something from the list, or break the loop. - while (next < ranges.size()) { - if (ranges[next].begin <= (ranges[i].end + 1)) { - // the next entry now overlaps / concatenates this one. - ranges[i].end = std::max(ranges[i].end, ranges[next].end); - ranges.remove(next); - } else - break; - } - + mergeRangesFrom(ranges, i); return; } } @@ -261,25 +351,95 @@ private: ranges.append(CharacterRange(lo, hi)); } - bool m_isCaseInsensitive; + void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index) + { + size_t next = index + 1; + + // each iteration of the loop we will either remove something from the list, or break out of the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[index].end + 1)) { + // the next entry now overlaps / concatenates with this one. + ranges[index].end = std::max(ranges[index].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + } + + void coalesceTables() + { + auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) { + + size_t matchesIndex = 0; + size_t rangesIndex = 0; + + while (matchesIndex < matches.size() && rangesIndex < ranges.size()) { + while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1) + matchesIndex++; + + if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) { + ranges[rangesIndex].begin = matches[matchesIndex]; + matches.remove(matchesIndex); + } + + while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1) + matchesIndex++; + + if (matchesIndex < matches.size()) { + if (matches[matchesIndex] == ranges[rangesIndex].end + 1) { + ranges[rangesIndex].end = matches[matchesIndex]; + matches.remove(matchesIndex); + + mergeRangesFrom(ranges, rangesIndex); + } else + matchesIndex++; + } + } + }; + + coalesceMatchesAndRanges(m_matches, m_ranges); + coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode); + + if (!m_matches.size() && !m_matchesUnicode.size() + && m_ranges.size() == 1 && m_rangesUnicode.size() == 1 + && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f + && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff) + m_anyCharacter = true; + } - Vector<UChar> m_matches; + bool hasNonBMPCharacters() + { + return m_hasNonBMPCharacters; + } + + bool anyCharacter() + { + return m_anyCharacter; + } + + bool m_isCaseInsensitive : 1; + bool m_hasNonBMPCharacters : 1; + bool m_anyCharacter : 1; + CanonicalMode m_canonicalMode; + + Vector<UChar32> m_matches; Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; + Vector<UChar32> m_matchesUnicode; Vector<CharacterRange> m_rangesUnicode; }; class YarrPatternConstructor { public: - YarrPatternConstructor(YarrPattern& pattern) + YarrPatternConstructor(YarrPattern& pattern, void* stackLimit) : m_pattern(pattern) - , m_characterClassConstructor(pattern.m_ignoreCase) - , m_invertParentheticalAssertion(false) + , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2) + , m_stackLimit(stackLimit) { - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); + auto body = std::make_unique<PatternDisjunction>(); m_pattern.m_body = body.get(); m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); + m_pattern.m_disjunctions.append(WTFMove(body)); } ~YarrPatternConstructor() @@ -291,15 +451,15 @@ public: m_pattern.reset(); m_characterClassConstructor.reset(); - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); + auto body = std::make_unique<PatternDisjunction>(); m_pattern.m_body = body.get(); m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); + m_pattern.m_disjunctions.append(WTFMove(body)); } void assertionBOL() { - if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) { m_alternative->m_startsWithBOL = true; m_alternative->m_containsBOL = true; m_pattern.m_containsBOL = true; @@ -315,41 +475,51 @@ public: m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); } - void atomPatternCharacter(UChar ch) + void atomPatternCharacter(UChar32 ch) { // We handle case-insensitive checking of unicode characters which do have both // cases by handling them as if they were defined using a CharacterClass. - if (!m_pattern.m_ignoreCase || isASCII(ch)) { + if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) { m_alternative->m_terms.append(PatternTerm(ch)); return; } - UCS2CanonicalizationRange* info = rangeInfoFor(ch); + const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2); if (info->type == CanonicalizeUnique) { m_alternative->m_terms.append(PatternTerm(ch)); return; } m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); + auto newCharacterClass = m_characterClassConstructor.charClass(); m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); + m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); } void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) { switch (classID) { - case DigitClassID: + case BuiltInCharacterClassID::DigitClassID: m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); break; - case SpaceClassID: + case BuiltInCharacterClassID::SpaceClassID: m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); break; - case WordClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + case BuiltInCharacterClassID::WordClassID: + if (m_pattern.unicode() && m_pattern.ignoreCase()) + m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert)); + else + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case BuiltInCharacterClassID::DotClassID: + ASSERT(!invert); + if (m_pattern.dotAll()) + m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false)); + else + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true)); break; - case NewlineClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + default: + m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert)); break; } } @@ -359,64 +529,83 @@ public: m_invertCharacterClass = invert; } - void atomCharacterClassAtom(UChar ch) + void atomCharacterClassAtom(UChar32 ch) { m_characterClassConstructor.putChar(ch); } - void atomCharacterClassRange(UChar begin, UChar end) + void atomCharacterClassRange(UChar32 begin, UChar32 end) { m_characterClassConstructor.putRange(begin, end); } void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) { - ASSERT(classID != NewlineClassID); + ASSERT(classID != BuiltInCharacterClassID::DotClassID); switch (classID) { - case DigitClassID: + case BuiltInCharacterClassID::DigitClassID: m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); break; - case SpaceClassID: + case BuiltInCharacterClassID::SpaceClassID: m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); break; - case WordClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + case BuiltInCharacterClassID::WordClassID: + if (m_pattern.unicode() && m_pattern.ignoreCase()) + m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass()); + else + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); break; default: - RELEASE_ASSERT_NOT_REACHED(); + if (!invert) + m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID)); + else + m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID)); } } void atomCharacterClassEnd() { - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); + auto newCharacterClass = m_characterClassConstructor.charClass(); + + if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) { + m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false)); + return; + } m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); + m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); } - void atomParenthesesSubpatternBegin(bool capture = true) + void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt) { unsigned subpatternId = m_pattern.m_numSubpatterns + 1; - if (capture) + if (capture) { m_pattern.m_numSubpatterns++; + if (optGroupName) { + while (m_pattern.m_captureGroupNames.size() < subpatternId) + m_pattern.m_captureGroupNames.append(String()); + m_pattern.m_captureGroupNames.append(optGroupName.value()); + m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId); + } + } else + ASSERT(!optGroupName); - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); m_alternative = parenthesesDisjunction->addNewAlternative(); - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); } void atomParentheticalAssertionBegin(bool invert = false) { - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); m_alternative = parenthesesDisjunction->addNewAlternative(); m_invertParentheticalAssertion = invert; - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); } void atomParenthesesEnd() @@ -429,8 +618,7 @@ public: PatternTerm& lastTerm = m_alternative->lastTerm(); - ASSERT(parenthesesDisjunction->m_alternatives.size() <= UINT_MAX); - unsigned numParenAlternatives = static_cast<unsigned>(parenthesesDisjunction->m_alternatives.size()); + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); unsigned numBOLAnchoredAlts = 0; for (unsigned i = 0; i < numParenAlternatives; i++) { @@ -478,16 +666,22 @@ public: m_alternative->m_terms.append(PatternTerm(subpatternId)); } - // deep copy the argument disjunction. If filterStartsWithBOL is true, + void atomNamedBackReference(String subpatternName) + { + ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end()); + atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName)); + } + + // deep copy the argument disjunction. If filterStartsWithBOL is true, // skip alternatives with m_startsWithBOL set true. PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) { - OwnPtr<PatternDisjunction> newDisjunction; + std::unique_ptr<PatternDisjunction> newDisjunction; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { if (!newDisjunction) { - newDisjunction = adoptPtr(new PatternDisjunction()); + newDisjunction = std::make_unique<PatternDisjunction>(); newDisjunction->m_parent = disjunction->m_parent; } PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); @@ -501,7 +695,7 @@ public: return 0; PatternDisjunction* copiedDisjunction = newDisjunction.get(); - m_pattern.m_disjunctions.append(newDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(newDisjunction)); return copiedDisjunction; } @@ -512,6 +706,7 @@ public: PatternTerm termCopy = term; termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + m_pattern.m_hasCopiedParenSubexpressions = true; return termCopy; } @@ -527,7 +722,7 @@ public: PatternTerm& term = m_alternative->lastTerm(); ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); - ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount); if (term.type == PatternTerm::TypeParentheticalAssertion) { // If an assertion is quantified with a minimum count of zero, it can simply be removed. @@ -549,12 +744,12 @@ public: return; } - if (min == 0) - term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); - else if (min == max) - term.quantify(min, QuantifierFixedCount); + if (min == max) + term.quantify(min, max, QuantifierFixedCount); + else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions)) + term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy); else { - term.quantify(min, QuantifierFixedCount); + term.quantify(min, min, QuantifierFixedCount); m_alternative->m_terms.append(copyTerm(term)); // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); @@ -568,10 +763,14 @@ public: m_alternative = m_alternative->m_parent->addNewAlternative(); } - unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN { + if (UNLIKELY(!isSafeToRecurse())) + return ErrorCode::TooManyDisjunctions; + + ErrorCode error = ErrorCode::NoError; alternative->m_hasFixedSize = true; - Checked<unsigned> currentInputPosition = initialInputPosition; + Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition; for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { PatternTerm& term = alternative->m_terms[i]; @@ -599,8 +798,14 @@ public: term.frameLocation = currentCallFrameSize; currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; alternative->m_hasFixedSize = false; + } else if (m_pattern.unicode()) { + Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount; + tempCount *= U16_LENGTH(term.patternCharacter); + if (tempCount.hasOverflowed()) + return ErrorCode::OffsetTooLarge; + currentInputPosition += tempCount; } else - currentInputPosition += term.quantityCount; + currentInputPosition += term.quantityMaxCount; break; case PatternTerm::TypeCharacterClass: @@ -609,29 +814,39 @@ public: term.frameLocation = currentCallFrameSize; currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; alternative->m_hasFixedSize = false; + } else if (m_pattern.unicode()) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; + currentInputPosition += term.quantityMaxCount; + alternative->m_hasFixedSize = false; } else - currentInputPosition += term.quantityCount; + currentInputPosition += term.quantityMaxCount; break; case PatternTerm::TypeParenthesesSubpattern: // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. term.frameLocation = currentCallFrameSize; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { - if (term.quantityType != QuantifierFixedCount) - currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; // If quantity is fixed, then pre-check its minimum size. if (term.quantityType == QuantifierFixedCount) currentInputPosition += term.parentheses.disjunction->m_minimumSize; term.inputPosition = currentInputPosition.unsafeGet(); } else if (term.parentheses.isTerminal) { currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; term.inputPosition = currentInputPosition.unsafeGet(); } else { term.inputPosition = currentInputPosition.unsafeGet(); - setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; } // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. alternative->m_hasFixedSize = false; @@ -640,35 +855,53 @@ public: case PatternTerm::TypeParentheticalAssertion: term.inputPosition = currentInputPosition.unsafeGet(); term.frameLocation = currentCallFrameSize; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; break; case PatternTerm::TypeDotStarEnclosure: + ASSERT(!m_pattern.m_saveInitialStartValue); alternative->m_hasFixedSize = false; term.inputPosition = initialInputPosition; + m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForDotStarEnclosure; + m_pattern.m_saveInitialStartValue = true; break; } + if (currentInputPosition.hasOverflowed()) + return ErrorCode::OffsetTooLarge; } alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); - return currentCallFrameSize; + newCallFrameSize = currentCallFrameSize; + return error; } - unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize) { + if (UNLIKELY(!isSafeToRecurse())) + return ErrorCode::TooManyDisjunctions; + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; unsigned minimumInputSize = UINT_MAX; unsigned maximumCallFrameSize = 0; bool hasFixedSize = true; + ErrorCode error = ErrorCode::NoError; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); - unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + unsigned currentAlternativeCallFrameSize; + error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize); + if (hasError(error)) + return error; minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); hasFixedSize &= alternative->m_hasFixedSize; + if (alternative->m_minimumSize > INT_MAX) + m_pattern.m_containsUnsignedLengthPattern = true; } ASSERT(minimumInputSize != UINT_MAX); @@ -677,12 +910,15 @@ public: disjunction->m_hasFixedSize = hasFixedSize; disjunction->m_minimumSize = minimumInputSize; disjunction->m_callFrameSize = maximumCallFrameSize; - return maximumCallFrameSize; + callFrameSize = maximumCallFrameSize; + return error; } - void setupOffsets() + ErrorCode setupOffsets() { - setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314). + unsigned ignoredCallFrameSize; + return setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize); } // This optimization identifies sets of parentheses that we will never need to backtrack. @@ -699,14 +935,15 @@ public: if (m_pattern.m_numSubpatterns) return; - Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; for (size_t i = 0; i < alternatives.size(); ++i) { Vector<PatternTerm>& terms = alternatives[i]->m_terms; if (terms.size()) { PatternTerm& term = terms.last(); if (term.type == PatternTerm::TypeParenthesesSubpattern && term.quantityType == QuantifierGreedy - && term.quantityCount == quantifyInfinite + && term.quantityMinCount == 0 + && term.quantityMaxCount == quantifyInfinite && !term.capture()) term.parentheses.isTerminal = true; } @@ -722,7 +959,7 @@ public: // At this point, this is only valid for non-multiline expressions. PatternDisjunction* disjunction = m_pattern.m_body; - if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + if (!m_pattern.m_containsBOL || m_pattern.multiline()) return; PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); @@ -740,11 +977,12 @@ public: } } - bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) + bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex) { Vector<PatternTerm>& terms = alternative->m_terms; - for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { + ASSERT(endIndex <= terms.size()); + for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) { PatternTerm& term = terms[termIndex]; if (term.m_capture) @@ -753,7 +991,7 @@ public: if (term.type == PatternTerm::TypeParenthesesSubpattern) { PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { - if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) + if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size())) return true; } } @@ -769,16 +1007,17 @@ public: // beginning and the end of the match. void optimizeDotStarWrappedExpressions() { - Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; if (alternatives.size() != 1) return; + CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass(); PatternAlternative* alternative = alternatives[0].get(); Vector<PatternTerm>& terms = alternative->m_terms; if (terms.size() >= 3) { bool startsWithBOL = false; bool endsWithEOL = false; - size_t termIndex, firstExpressionTerm, lastExpressionTerm; + size_t termIndex, firstExpressionTerm; termIndex = 0; if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { @@ -787,7 +1026,10 @@ public: } PatternTerm& firstNonAnchorTerm = terms[termIndex]; - if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) + if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass + || firstNonAnchorTerm.characterClass != dotCharacterClass + || firstNonAnchorTerm.quantityMinCount + || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite) return; firstExpressionTerm = termIndex + 1; @@ -799,16 +1041,19 @@ public: } PatternTerm& lastNonAnchorTerm = terms[termIndex]; - if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) + if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass + || lastNonAnchorTerm.characterClass != dotCharacterClass + || lastNonAnchorTerm.quantityType != QuantifierGreedy + || lastNonAnchorTerm.quantityMinCount + || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite) return; - - lastExpressionTerm = termIndex - 1; - if (firstExpressionTerm > lastExpressionTerm) + size_t endIndex = termIndex; + if (firstExpressionTerm >= endIndex) return; - if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { - for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) + if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) { + for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex) terms.remove(termIndex); for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) @@ -822,62 +1067,445 @@ public: } private: + bool isSafeToRecurse() const + { + if (!m_stackLimit) + return true; + int8_t* curr = reinterpret_cast<int8_t*>(&curr); + int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit); + return curr >= limit; + } + YarrPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + void* m_stackLimit; bool m_invertCharacterClass; - bool m_invertParentheticalAssertion; + bool m_invertParentheticalAssertion { false }; }; -const char* YarrPattern::compile(const String& patternString) +ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit) { - YarrPatternConstructor constructor(*this); + YarrPatternConstructor constructor(*this, stackLimit); - if (const char* error = parse(constructor, patternString)) - return error; + if (m_flags == InvalidFlags) + return ErrorCode::InvalidRegularExpressionFlags; + + { + ErrorCode error = parse(constructor, patternString, unicode()); + if (hasError(error)) + return error; + } // If the pattern contains illegal backreferences reset & reparse. // Quoting Netscape's "What's new in JavaScript 1.2", // "Note: if the number of left parentheses is less than the number specified // in \#, the \# is taken as an octal escape as described in the next row." if (containsIllegalBackReference()) { + if (unicode()) + return ErrorCode::InvalidBackreference; + unsigned numSubpatterns = m_numSubpatterns; constructor.reset(); -#if !ASSERT_DISABLED - const char* error = -#endif - parse(constructor, patternString, numSubpatterns); - - ASSERT(!error); + ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns); + ASSERT_UNUSED(error, !hasError(error)); ASSERT(numSubpatterns == m_numSubpatterns); } constructor.checkForTerminalParentheses(); constructor.optimizeDotStarWrappedExpressions(); constructor.optimizeBOL(); - - constructor.setupOffsets(); - return 0; + { + ErrorCode error = constructor.setupOffsets(); + if (hasError(error)) + return error; + } + + if (Options::dumpCompiledRegExpPatterns()) + dumpPattern(patternString); + + return ErrorCode::NoError; } -YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) - : m_ignoreCase(ignoreCase) - , m_multiline(multiline) - , m_containsBackreferences(false) +YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, ErrorCode& error, void* stackLimit) + : m_containsBackreferences(false) , m_containsBOL(false) - , m_numSubpatterns(0) - , m_maxBackReference(0) - , newlineCached(0) - , digitsCached(0) - , spacesCached(0) - , wordcharCached(0) - , nondigitsCached(0) - , nonspacesCached(0) - , nonwordcharCached(0) + , m_containsUnsignedLengthPattern(false) + , m_hasCopiedParenSubexpressions(false) + , m_saveInitialStartValue(false) + , m_flags(flags) +{ + error = compile(pattern, stackLimit); +} + +void indentForNestingLevel(PrintStream& out, unsigned nestingDepth) +{ + out.print(" "); + for (; nestingDepth; --nestingDepth) + out.print(" "); +} + +void dumpUChar32(PrintStream& out, UChar32 c) +{ + if (c >= ' '&& c <= 0xff) + out.printf("'%c'", static_cast<char>(c)); + else + out.printf("0x%04x", c); +} + +void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass) +{ + if (characterClass == pattern->anyCharacterClass()) + out.print("<any character>"); + else if (characterClass == pattern->newlineCharacterClass()) + out.print("<newline>"); + else if (characterClass == pattern->digitsCharacterClass()) + out.print("<digits>"); + else if (characterClass == pattern->spacesCharacterClass()) + out.print("<whitespace>"); + else if (characterClass == pattern->wordcharCharacterClass()) + out.print("<word>"); + else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode ignore case>"); + else if (characterClass == pattern->nondigitsCharacterClass()) + out.print("<non-digits>"); + else if (characterClass == pattern->nonspacesCharacterClass()) + out.print("<non-whitespace>"); + else if (characterClass == pattern->nonwordcharCharacterClass()) + out.print("<non-word>"); + else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode non-ignore case>"); + else { + bool needMatchesRangesSeperator = false; + + auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) { + size_t matchesSize = matches.size(); + if (matchesSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, ":("); + for (size_t i = 0; i < matchesSize; ++i) { + if (i) + out.print(","); + dumpUChar32(out, matches[i]); + } + out.print(")"); + } + }; + + auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) { + size_t rangeSize = ranges.size(); + if (rangeSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, " ranges:("); + for (size_t i = 0; i < rangeSize; ++i) { + if (i) + out.print(","); + CharacterRange range = ranges[i]; + out.print("("); + dumpUChar32(out, range.begin); + out.print(".."); + dumpUChar32(out, range.end); + out.print(")"); + } + out.print(")"); + } + }; + + out.print("["); + dumpMatches("ASCII", characterClass->m_matches); + dumpRanges("ASCII", characterClass->m_ranges); + dumpMatches("Unicode", characterClass->m_matchesUnicode); + dumpRanges("Unicode", characterClass->m_rangesUnicode); + out.print("]"); + } +} + +void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth) +{ + out.print("minimum size: ", m_minimumSize); + if (m_hasFixedSize) + out.print(",fixed size"); + if (m_onceThrough) + out.print(",once through"); + if (m_startsWithBOL) + out.print(",starts with ^"); + if (m_containsBOL) + out.print(",contains ^"); + out.print("\n"); + + for (size_t i = 0; i < m_terms.size(); ++i) + m_terms[i].dump(out, thisPattern, nestingDepth); +} + +void PatternTerm::dumpQuantifier(PrintStream& out) +{ + if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1) + return; + out.print(" {", quantityMinCount.unsafeGet()); + if (quantityMinCount != quantityMaxCount) { + if (quantityMaxCount == UINT_MAX) + out.print(",..."); + else + out.print(",", quantityMaxCount.unsafeGet()); + } + out.print("}"); + if (quantityType == QuantifierGreedy) + out.print(" greedy"); + else if (quantityType == QuantifierNonGreedy) + out.print(" non-greedy"); +} + +void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth) +{ + indentForNestingLevel(out, nestingDepth); + + if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) { + if (invert()) + out.print("not "); + } + + switch (type) { + case TypeAssertionBOL: + out.println("BOL"); + break; + case TypeAssertionEOL: + out.println("EOL"); + break; + case TypeAssertionWordBoundary: + out.println("word boundary"); + break; + case TypePatternCharacter: + out.printf("character "); + out.printf("inputPosition %u ", inputPosition); + if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) { + dumpUChar32(out, toASCIIUpper(patternCharacter)); + out.print("/"); + dumpUChar32(out, toASCIILower(patternCharacter)); + } else + dumpUChar32(out, patternCharacter); + dumpQuantifier(out); + if (quantityType != QuantifierFixedCount) + out.print(",frame location ", frameLocation); + out.println(); + break; + case TypeCharacterClass: + out.print("character class "); + if (characterClass->m_anyCharacter) + out.print("<any character>"); + else if (characterClass == thisPattern->newlineCharacterClass()) + out.print("<newline>"); + else if (characterClass == thisPattern->digitsCharacterClass()) + out.print("<digits>"); + else if (characterClass == thisPattern->spacesCharacterClass()) + out.print("<whitespace>"); + else if (characterClass == thisPattern->wordcharCharacterClass()) + out.print("<word>"); + else if (characterClass == thisPattern->wordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode ignore case>"); + else if (characterClass == thisPattern->nondigitsCharacterClass()) + out.print("<non-digits>"); + else if (characterClass == thisPattern->nonspacesCharacterClass()) + out.print("<non-whitespace>"); + else if (characterClass == thisPattern->nonwordcharCharacterClass()) + out.print("<non-word>"); + else if (characterClass == thisPattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode non-ignore case>"); + else { + bool needMatchesRangesSeperator = false; + + auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) { + size_t matchesSize = matches.size(); + if (matchesSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, ":("); + for (size_t i = 0; i < matchesSize; ++i) { + if (i) + out.print(","); + dumpUChar32(out, matches[i]); + } + out.print(")"); + } + }; + + auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) { + size_t rangeSize = ranges.size(); + if (rangeSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, " ranges:("); + for (size_t i = 0; i < rangeSize; ++i) { + if (i) + out.print(","); + CharacterRange range = ranges[i]; + out.print("("); + dumpUChar32(out, range.begin); + out.print(".."); + dumpUChar32(out, range.end); + out.print(")"); + } + out.print(")"); + } + }; + + out.print("["); + dumpMatches("ASCII", characterClass->m_matches); + dumpRanges("ASCII", characterClass->m_ranges); + dumpMatches("Unicode", characterClass->m_matchesUnicode); + dumpRanges("Unicode", characterClass->m_rangesUnicode); + out.print("]"); + } + dumpQuantifier(out); + if (quantityType != QuantifierFixedCount || thisPattern->unicode()) + out.print(",frame location ", frameLocation); + out.println(); + break; + case TypeBackReference: + out.print("back reference to subpattern #", backReferenceSubpatternId); + out.println(",frame location ", frameLocation); + break; + case TypeForwardReference: + out.println("forward reference"); + break; + case TypeParenthesesSubpattern: + if (m_capture) + out.print("captured "); + else + out.print("non-captured "); + + FALLTHROUGH; + case TypeParentheticalAssertion: + if (m_invert) + out.print("inverted "); + + if (type == TypeParenthesesSubpattern) + out.print("subpattern"); + else if (type == TypeParentheticalAssertion) + out.print("assertion"); + + if (m_capture) + out.print(" #", parentheses.subpatternId); + + dumpQuantifier(out); + + if (parentheses.isCopy) + out.print(",copy"); + + if (parentheses.isTerminal) + out.print(",terminal"); + + out.println(",frame location ", frameLocation); + + if (parentheses.disjunction->m_alternatives.size() > 1) { + indentForNestingLevel(out, nestingDepth + 1); + unsigned alternativeFrameLocation = frameLocation; + if (quantityMaxCount == 1 && !parentheses.isCopy) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + else if (parentheses.isTerminal) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal; + else + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses; + out.println("alternative list,frame location ", alternativeFrameLocation); + } + + parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1); + break; + case TypeDotStarEnclosure: + out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation); + break; + } +} + +void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0) +{ + unsigned alternativeCount = m_alternatives.size(); + for (unsigned i = 0; i < alternativeCount; ++i) { + indentForNestingLevel(out, nestingDepth); + if (alternativeCount > 1) + out.print("alternative #", i, ": "); + m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1)); + } +} + +void YarrPattern::dumpPattern(const String& patternString) +{ + dumpPattern(WTF::dataFile(), patternString); +} + +void YarrPattern::dumpPattern(PrintStream& out, const String& patternString) +{ + out.print("RegExp pattern for /"); + out.print(patternString); + out.print("/"); + if (global()) + out.print("g"); + if (ignoreCase()) + out.print("i"); + if (multiline()) + out.print("m"); + if (unicode()) + out.print("u"); + if (sticky()) + out.print("y"); + if (m_flags != NoFlags) { + bool printSeperator = false; + out.print(" ("); + if (global()) { + out.print("global"); + printSeperator = true; + } + if (ignoreCase()) { + if (printSeperator) + out.print("|"); + out.print("ignore case"); + printSeperator = true; + } + if (multiline()) { + if (printSeperator) + out.print("|"); + out.print("multiline"); + printSeperator = true; + } + if (unicode()) { + if (printSeperator) + out.print("|"); + out.print("unicode"); + printSeperator = true; + } + if (sticky()) { + if (printSeperator) + out.print("|"); + out.print("sticky"); + printSeperator = true; + } + out.print(")"); + } + out.print(":\n"); + if (m_body->m_callFrameSize) + out.print(" callframe size: ", m_body->m_callFrameSize, "\n"); + m_body->dump(out, this); +} + +std::unique_ptr<CharacterClass> anycharCreate() { - *error = compile(pattern); + auto characterClass = std::make_unique<CharacterClass>(); + characterClass->m_ranges.append(CharacterRange(0x00, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff)); + characterClass->m_hasNonBMPCharacters = true; + characterClass->m_anyCharacter = true; + return characterClass; } -} } +} } // namespace JSC::Yarr |