aboutsummaryrefslogtreecommitdiffstats
path: root/src/3rdparty/masm/yarr/YarrPattern.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/masm/yarr/YarrPattern.cpp')
-rw-r--r--src/3rdparty/masm/yarr/YarrPattern.cpp962
1 files changed, 795 insertions, 167 deletions
diff --git a/src/3rdparty/masm/yarr/YarrPattern.cpp b/src/3rdparty/masm/yarr/YarrPattern.cpp
index c7e5b6b09b..ac66ea1b9a 100644
--- a/src/3rdparty/masm/yarr/YarrPattern.cpp
+++ b/src/3rdparty/masm/yarr/YarrPattern.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
+ * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
@@ -27,10 +27,15 @@
#include "config.h"
#include "YarrPattern.h"
+#include "Options.h"
#include "Yarr.h"
-#include "YarrCanonicalizeUCS2.h"
+#include "YarrCanonicalize.h"
#include "YarrParser.h"
+#include <wtf/DataLog.h>
+#include <wtf/Optional.h>
+//#include <wtf/Threading.h>
#include <wtf/Vector.h>
+#include <wtf/text/WTFString.h>
using namespace WTF;
@@ -40,8 +45,11 @@ namespace JSC { namespace Yarr {
class CharacterClassConstructor {
public:
- CharacterClassConstructor(bool isCaseInsensitive = false)
+ CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
: m_isCaseInsensitive(isCaseInsensitive)
+ , m_hasNonBMPCharacters(false)
+ , m_anyCharacter(false)
+ , m_canonicalMode(canonicalMode)
{
}
@@ -51,6 +59,8 @@ public:
m_ranges.clear();
m_matchesUnicode.clear();
m_rangesUnicode.clear();
+ m_hasNonBMPCharacters = false;
+ m_anyCharacter = false;
}
void append(const CharacterClass* other)
@@ -65,11 +75,71 @@ public:
addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
}
- void putChar(UChar ch)
+ void appendInverted(const CharacterClass* other)
{
- // Handle ascii cases.
- if (ch <= 0x7f) {
- if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
+ auto addSortedInverted = [&](UChar32 min, UChar32 max,
+ const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
+ Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
+
+ auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
+ if (lo < hiPlusOne) {
+ if (lo + 1 == hiPlusOne)
+ addSorted(destMatches, lo);
+ else
+ addSortedRange(destRanges, lo, hiPlusOne - 1);
+ }
+ };
+
+ UChar32 lo = min;
+ size_t matchesIndex = 0;
+ size_t rangesIndex = 0;
+ bool matchesRemaining = matchesIndex < srcMatches.size();
+ bool rangesRemaining = rangesIndex < srcRanges.size();
+
+ if (!matchesRemaining && !rangesRemaining) {
+ addSortedMatchOrRange(min, max + 1);
+ return;
+ }
+
+ while (matchesRemaining || rangesRemaining) {
+ UChar32 hiPlusOne;
+ UChar32 nextLo;
+
+ if (matchesRemaining
+ && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
+ hiPlusOne = srcMatches[matchesIndex];
+ nextLo = hiPlusOne + 1;
+ ++matchesIndex;
+ matchesRemaining = matchesIndex < srcMatches.size();
+ } else {
+ hiPlusOne = srcRanges[rangesIndex].begin;
+ nextLo = srcRanges[rangesIndex].end + 1;
+ ++rangesIndex;
+ rangesRemaining = rangesIndex < srcRanges.size();
+ }
+
+ addSortedMatchOrRange(lo, hiPlusOne);
+
+ lo = nextLo;
+ }
+
+ addSortedMatchOrRange(lo, max + 1);
+ };
+
+ addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges);
+ addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
+ }
+
+ void putChar(UChar32 ch)
+ {
+ if (!m_isCaseInsensitive) {
+ addSorted(ch);
+ return;
+ }
+
+ if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
+ // Handle ASCII cases.
+ if (isASCIIAlpha(ch)) {
addSorted(m_matches, toASCIIUpper(ch));
addSorted(m_matches, toASCIILower(ch));
} else
@@ -77,40 +147,33 @@ public:
return;
}
- // Simple case, not a case-insensitive match.
- if (!m_isCaseInsensitive) {
- addSorted(m_matchesUnicode, ch);
- return;
- }
-
// Add multiple matches, if necessary.
- UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode);
if (info->type == CanonicalizeUnique)
- addSorted(m_matchesUnicode, ch);
+ addSorted(ch);
else
putUnicodeIgnoreCase(ch, info);
}
- void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
+ void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
{
ASSERT(m_isCaseInsensitive);
- ASSERT(ch > 0x7f);
ASSERT(ch >= info->begin && ch <= info->end);
ASSERT(info->type != CanonicalizeUnique);
if (info->type == CanonicalizeSet) {
- for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
- addSorted(m_matchesUnicode, ch);
+ for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
+ addSorted(ch);
} else {
- addSorted(m_matchesUnicode, ch);
- addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+ addSorted(ch);
+ addSorted(getCanonicalPair(info, ch));
}
}
- void putRange(UChar lo, UChar hi)
+ void putRange(UChar32 lo, UChar32 hi)
{
- if (lo <= 0x7f) {
+ if (isASCII(lo)) {
char asciiLo = lo;
- char asciiHi = std::min(hi, (UChar)0x7f);
+ char asciiHi = std::min(hi, (UChar32)0x7f);
addSortedRange(m_ranges, lo, asciiHi);
if (m_isCaseInsensitive) {
@@ -120,19 +183,19 @@ public:
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
- if (hi <= 0x7f)
+ if (isASCII(hi))
return;
- lo = std::max(lo, (UChar)0x80);
+ lo = std::max(lo, (UChar32)0x80);
addSortedRange(m_rangesUnicode, lo, hi);
if (!m_isCaseInsensitive)
return;
- UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ const CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode);
while (true) {
// Handle the range [lo .. end]
- UChar end = std::min<UChar>(info->end, hi);
+ UChar32 end = std::min<UChar32>(info->end, hi);
switch (info->type) {
case CanonicalizeUnique:
@@ -140,7 +203,7 @@ public:
break;
case CanonicalizeSet: {
UChar ch;
- for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set)
addSorted(m_matchesUnicode, ch);
break;
}
@@ -175,24 +238,38 @@ public:
}
- PassOwnPtr<CharacterClass> charClass()
+ std::unique_ptr<CharacterClass> charClass()
{
- OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass);
+ coalesceTables();
+
+ auto characterClass = std::make_unique<CharacterClass>();
characterClass->m_matches.swap(m_matches);
characterClass->m_ranges.swap(m_ranges);
characterClass->m_matchesUnicode.swap(m_matchesUnicode);
characterClass->m_rangesUnicode.swap(m_rangesUnicode);
+ characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
+ characterClass->m_anyCharacter = anyCharacter();
+
+ m_hasNonBMPCharacters = false;
+ m_anyCharacter = false;
- return characterClass.release();
+ return characterClass;
}
private:
- void addSorted(Vector<UChar>& matches, UChar ch)
+ void addSorted(UChar32 ch)
+ {
+ addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
+ }
+
+ void addSorted(Vector<UChar32>& matches, UChar32 ch)
{
unsigned pos = 0;
- ASSERT(matches.size() <= UINT_MAX);
- unsigned range = static_cast<unsigned>(matches.size());
+ unsigned range = matches.size();
+
+ if (!U_IS_BMP(ch))
+ m_hasNonBMPCharacters = true;
// binary chop, find position to insert char.
while (range) {
@@ -201,9 +278,31 @@ private:
int val = matches[pos+index] - ch;
if (!val)
return;
- else if (val > 0)
+ else if (val > 0) {
+ if (val == 1) {
+ UChar32 lo = ch;
+ UChar32 hi = ch + 1;
+ matches.remove(pos + index);
+ if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
+ lo = ch - 1;
+ matches.remove(pos + index - 1);
+ }
+ addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
+ return;
+ }
range = index;
- else {
+ } else {
+ if (val == -1) {
+ UChar32 lo = ch - 1;
+ UChar32 hi = ch;
+ matches.remove(pos + index);
+ if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
+ hi = ch + 1;
+ matches.remove(pos + index + 1);
+ }
+ addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
+ return;
+ }
pos += (index+1);
range -= (index+1);
}
@@ -215,17 +314,19 @@ private:
matches.insert(pos, ch);
}
- void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
+ void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
{
- ASSERT(ranges.size() <= UINT_MAX);
- unsigned end = static_cast<unsigned>(ranges.size());
-
+ size_t end = ranges.size();
+
+ if (!U_IS_BMP(hi))
+ m_hasNonBMPCharacters = true;
+
// Simple linear scan - I doubt there are that many ranges anyway...
// feel free to fix this with something faster (eg binary chop).
- for (unsigned i = 0; i < end; ++i) {
+ for (size_t i = 0; i < end; ++i) {
// does the new range fall before the current position in the array
if (hi < ranges[i].begin) {
- // optional optimization: concatenate appending ranges? - may not be worthwhile.
+ // Concatenate appending ranges.
if (hi == (ranges[i].begin - 1)) {
ranges[i].begin = lo;
return;
@@ -233,7 +334,7 @@ private:
ranges.insert(i, CharacterRange(lo, hi));
return;
}
- // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning
+ // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
// end of the last range they concatenate, which is just as good.
if (lo <= (ranges[i].end + 1)) {
@@ -241,18 +342,7 @@ private:
ranges[i].begin = std::min(ranges[i].begin, lo);
ranges[i].end = std::max(ranges[i].end, hi);
- // now check if the new range can subsume any subsequent ranges.
- unsigned next = i+1;
- // each iteration of the loop we will either remove something from the list, or break the loop.
- while (next < ranges.size()) {
- if (ranges[next].begin <= (ranges[i].end + 1)) {
- // the next entry now overlaps / concatenates this one.
- ranges[i].end = std::max(ranges[i].end, ranges[next].end);
- ranges.remove(next);
- } else
- break;
- }
-
+ mergeRangesFrom(ranges, i);
return;
}
}
@@ -261,25 +351,95 @@ private:
ranges.append(CharacterRange(lo, hi));
}
- bool m_isCaseInsensitive;
+ void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
+ {
+ size_t next = index + 1;
+
+ // each iteration of the loop we will either remove something from the list, or break out of the loop.
+ while (next < ranges.size()) {
+ if (ranges[next].begin <= (ranges[index].end + 1)) {
+ // the next entry now overlaps / concatenates with this one.
+ ranges[index].end = std::max(ranges[index].end, ranges[next].end);
+ ranges.remove(next);
+ } else
+ break;
+ }
+
+ }
+
+ void coalesceTables()
+ {
+ auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
+
+ size_t matchesIndex = 0;
+ size_t rangesIndex = 0;
+
+ while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
+ while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
+ matchesIndex++;
+
+ if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
+ ranges[rangesIndex].begin = matches[matchesIndex];
+ matches.remove(matchesIndex);
+ }
+
+ while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
+ matchesIndex++;
+
+ if (matchesIndex < matches.size()) {
+ if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
+ ranges[rangesIndex].end = matches[matchesIndex];
+ matches.remove(matchesIndex);
+
+ mergeRangesFrom(ranges, rangesIndex);
+ } else
+ matchesIndex++;
+ }
+ }
+ };
+
+ coalesceMatchesAndRanges(m_matches, m_ranges);
+ coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
+
+ if (!m_matches.size() && !m_matchesUnicode.size()
+ && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
+ && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
+ && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
+ m_anyCharacter = true;
+ }
- Vector<UChar> m_matches;
+ bool hasNonBMPCharacters()
+ {
+ return m_hasNonBMPCharacters;
+ }
+
+ bool anyCharacter()
+ {
+ return m_anyCharacter;
+ }
+
+ bool m_isCaseInsensitive : 1;
+ bool m_hasNonBMPCharacters : 1;
+ bool m_anyCharacter : 1;
+ CanonicalMode m_canonicalMode;
+
+ Vector<UChar32> m_matches;
Vector<CharacterRange> m_ranges;
- Vector<UChar> m_matchesUnicode;
+ Vector<UChar32> m_matchesUnicode;
Vector<CharacterRange> m_rangesUnicode;
};
class YarrPatternConstructor {
public:
- YarrPatternConstructor(YarrPattern& pattern)
+ YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
: m_pattern(pattern)
- , m_characterClassConstructor(pattern.m_ignoreCase)
- , m_invertParentheticalAssertion(false)
+ , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
+ , m_stackLimit(stackLimit)
{
- OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
+ auto body = std::make_unique<PatternDisjunction>();
m_pattern.m_body = body.get();
m_alternative = body->addNewAlternative();
- m_pattern.m_disjunctions.append(body.release());
+ m_pattern.m_disjunctions.append(WTFMove(body));
}
~YarrPatternConstructor()
@@ -291,15 +451,15 @@ public:
m_pattern.reset();
m_characterClassConstructor.reset();
- OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
+ auto body = std::make_unique<PatternDisjunction>();
m_pattern.m_body = body.get();
m_alternative = body->addNewAlternative();
- m_pattern.m_disjunctions.append(body.release());
+ m_pattern.m_disjunctions.append(WTFMove(body));
}
void assertionBOL()
{
- if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
+ if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
m_alternative->m_startsWithBOL = true;
m_alternative->m_containsBOL = true;
m_pattern.m_containsBOL = true;
@@ -315,41 +475,51 @@ public:
m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
}
- void atomPatternCharacter(UChar ch)
+ void atomPatternCharacter(UChar32 ch)
{
// We handle case-insensitive checking of unicode characters which do have both
// cases by handling them as if they were defined using a CharacterClass.
- if (!m_pattern.m_ignoreCase || isASCII(ch)) {
+ if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
m_alternative->m_terms.append(PatternTerm(ch));
return;
}
- UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
if (info->type == CanonicalizeUnique) {
m_alternative->m_terms.append(PatternTerm(ch));
return;
}
m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
- OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
+ auto newCharacterClass = m_characterClassConstructor.charClass();
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
- m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
+ m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
{
switch (classID) {
- case DigitClassID:
+ case BuiltInCharacterClassID::DigitClassID:
m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
break;
- case SpaceClassID:
+ case BuiltInCharacterClassID::SpaceClassID:
m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
break;
- case WordClassID:
- m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+ case BuiltInCharacterClassID::WordClassID:
+ if (m_pattern.unicode() && m_pattern.ignoreCase())
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
+ else
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+ break;
+ case BuiltInCharacterClassID::DotClassID:
+ ASSERT(!invert);
+ if (m_pattern.dotAll())
+ m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
+ else
+ m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true));
break;
- case NewlineClassID:
- m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
+ default:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert));
break;
}
}
@@ -359,64 +529,83 @@ public:
m_invertCharacterClass = invert;
}
- void atomCharacterClassAtom(UChar ch)
+ void atomCharacterClassAtom(UChar32 ch)
{
m_characterClassConstructor.putChar(ch);
}
- void atomCharacterClassRange(UChar begin, UChar end)
+ void atomCharacterClassRange(UChar32 begin, UChar32 end)
{
m_characterClassConstructor.putRange(begin, end);
}
void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
{
- ASSERT(classID != NewlineClassID);
+ ASSERT(classID != BuiltInCharacterClassID::DotClassID);
switch (classID) {
- case DigitClassID:
+ case BuiltInCharacterClassID::DigitClassID:
m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
break;
- case SpaceClassID:
+ case BuiltInCharacterClassID::SpaceClassID:
m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
break;
- case WordClassID:
- m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
+ case BuiltInCharacterClassID::WordClassID:
+ if (m_pattern.unicode() && m_pattern.ignoreCase())
+ m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
+ else
+ m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
break;
default:
- RELEASE_ASSERT_NOT_REACHED();
+ if (!invert)
+ m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID));
+ else
+ m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID));
}
}
void atomCharacterClassEnd()
{
- OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
+ auto newCharacterClass = m_characterClassConstructor.charClass();
+
+ if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
+ m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
+ return;
+ }
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
- m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
+ m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
- void atomParenthesesSubpatternBegin(bool capture = true)
+ void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt)
{
unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
- if (capture)
+ if (capture) {
m_pattern.m_numSubpatterns++;
+ if (optGroupName) {
+ while (m_pattern.m_captureGroupNames.size() < subpatternId)
+ m_pattern.m_captureGroupNames.append(String());
+ m_pattern.m_captureGroupNames.append(optGroupName.value());
+ m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId);
+ }
+ } else
+ ASSERT(!optGroupName);
- OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
+ auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
m_alternative = parenthesesDisjunction->addNewAlternative();
- m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
+ m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
}
void atomParentheticalAssertionBegin(bool invert = false)
{
- OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
+ auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
m_alternative = parenthesesDisjunction->addNewAlternative();
m_invertParentheticalAssertion = invert;
- m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
+ m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
}
void atomParenthesesEnd()
@@ -429,8 +618,7 @@ public:
PatternTerm& lastTerm = m_alternative->lastTerm();
- ASSERT(parenthesesDisjunction->m_alternatives.size() <= UINT_MAX);
- unsigned numParenAlternatives = static_cast<unsigned>(parenthesesDisjunction->m_alternatives.size());
+ unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
unsigned numBOLAnchoredAlts = 0;
for (unsigned i = 0; i < numParenAlternatives; i++) {
@@ -478,16 +666,22 @@ public:
m_alternative->m_terms.append(PatternTerm(subpatternId));
}
- // deep copy the argument disjunction. If filterStartsWithBOL is true,
+ void atomNamedBackReference(String subpatternName)
+ {
+ ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
+ atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName));
+ }
+
+ // deep copy the argument disjunction. If filterStartsWithBOL is true,
// skip alternatives with m_startsWithBOL set true.
PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
{
- OwnPtr<PatternDisjunction> newDisjunction;
+ std::unique_ptr<PatternDisjunction> newDisjunction;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
if (!newDisjunction) {
- newDisjunction = adoptPtr(new PatternDisjunction());
+ newDisjunction = std::make_unique<PatternDisjunction>();
newDisjunction->m_parent = disjunction->m_parent;
}
PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
@@ -501,7 +695,7 @@ public:
return 0;
PatternDisjunction* copiedDisjunction = newDisjunction.get();
- m_pattern.m_disjunctions.append(newDisjunction.release());
+ m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
return copiedDisjunction;
}
@@ -512,6 +706,7 @@ public:
PatternTerm termCopy = term;
termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
+ m_pattern.m_hasCopiedParenSubexpressions = true;
return termCopy;
}
@@ -527,7 +722,7 @@ public:
PatternTerm& term = m_alternative->lastTerm();
ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
- ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
+ ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
if (term.type == PatternTerm::TypeParentheticalAssertion) {
// If an assertion is quantified with a minimum count of zero, it can simply be removed.
@@ -549,12 +744,12 @@ public:
return;
}
- if (min == 0)
- term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
- else if (min == max)
- term.quantify(min, QuantifierFixedCount);
+ if (min == max)
+ term.quantify(min, max, QuantifierFixedCount);
+ else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
+ term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
else {
- term.quantify(min, QuantifierFixedCount);
+ term.quantify(min, min, QuantifierFixedCount);
m_alternative->m_terms.append(copyTerm(term));
// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
@@ -568,10 +763,14 @@ public:
m_alternative = m_alternative->m_parent->addNewAlternative();
}
- unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
+ ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
{
+ if (UNLIKELY(!isSafeToRecurse()))
+ return ErrorCode::TooManyDisjunctions;
+
+ ErrorCode error = ErrorCode::NoError;
alternative->m_hasFixedSize = true;
- Checked<unsigned> currentInputPosition = initialInputPosition;
+ Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
PatternTerm& term = alternative->m_terms[i];
@@ -599,8 +798,14 @@ public:
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
alternative->m_hasFixedSize = false;
+ } else if (m_pattern.unicode()) {
+ Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
+ tempCount *= U16_LENGTH(term.patternCharacter);
+ if (tempCount.hasOverflowed())
+ return ErrorCode::OffsetTooLarge;
+ currentInputPosition += tempCount;
} else
- currentInputPosition += term.quantityCount;
+ currentInputPosition += term.quantityMaxCount;
break;
case PatternTerm::TypeCharacterClass:
@@ -609,29 +814,39 @@ public:
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
alternative->m_hasFixedSize = false;
+ } else if (m_pattern.unicode()) {
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
+ currentInputPosition += term.quantityMaxCount;
+ alternative->m_hasFixedSize = false;
} else
- currentInputPosition += term.quantityCount;
+ currentInputPosition += term.quantityMaxCount;
break;
case PatternTerm::TypeParenthesesSubpattern:
// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
term.frameLocation = currentCallFrameSize;
- if (term.quantityCount == 1 && !term.parentheses.isCopy) {
- if (term.quantityType != QuantifierFixedCount)
- currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (hasError(error))
+ return error;
// If quantity is fixed, then pre-check its minimum size.
if (term.quantityType == QuantifierFixedCount)
currentInputPosition += term.parentheses.disjunction->m_minimumSize;
term.inputPosition = currentInputPosition.unsafeGet();
} else if (term.parentheses.isTerminal) {
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (hasError(error))
+ return error;
term.inputPosition = currentInputPosition.unsafeGet();
} else {
term.inputPosition = currentInputPosition.unsafeGet();
- setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (hasError(error))
+ return error;
}
// Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
alternative->m_hasFixedSize = false;
@@ -640,35 +855,53 @@ public:
case PatternTerm::TypeParentheticalAssertion:
term.inputPosition = currentInputPosition.unsafeGet();
term.frameLocation = currentCallFrameSize;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
+ error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize);
+ if (hasError(error))
+ return error;
break;
case PatternTerm::TypeDotStarEnclosure:
+ ASSERT(!m_pattern.m_saveInitialStartValue);
alternative->m_hasFixedSize = false;
term.inputPosition = initialInputPosition;
+ m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
+ m_pattern.m_saveInitialStartValue = true;
break;
}
+ if (currentInputPosition.hasOverflowed())
+ return ErrorCode::OffsetTooLarge;
}
alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
- return currentCallFrameSize;
+ newCallFrameSize = currentCallFrameSize;
+ return error;
}
- unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
+ ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
{
+ if (UNLIKELY(!isSafeToRecurse()))
+ return ErrorCode::TooManyDisjunctions;
+
if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
unsigned minimumInputSize = UINT_MAX;
unsigned maximumCallFrameSize = 0;
bool hasFixedSize = true;
+ ErrorCode error = ErrorCode::NoError;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
- unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
+ unsigned currentAlternativeCallFrameSize;
+ error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize);
+ if (hasError(error))
+ return error;
minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
hasFixedSize &= alternative->m_hasFixedSize;
+ if (alternative->m_minimumSize > INT_MAX)
+ m_pattern.m_containsUnsignedLengthPattern = true;
}
ASSERT(minimumInputSize != UINT_MAX);
@@ -677,12 +910,15 @@ public:
disjunction->m_hasFixedSize = hasFixedSize;
disjunction->m_minimumSize = minimumInputSize;
disjunction->m_callFrameSize = maximumCallFrameSize;
- return maximumCallFrameSize;
+ callFrameSize = maximumCallFrameSize;
+ return error;
}
- void setupOffsets()
+ ErrorCode setupOffsets()
{
- setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
+ // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
+ unsigned ignoredCallFrameSize;
+ return setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize);
}
// This optimization identifies sets of parentheses that we will never need to backtrack.
@@ -699,14 +935,15 @@ public:
if (m_pattern.m_numSubpatterns)
return;
- Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives;
+ Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
for (size_t i = 0; i < alternatives.size(); ++i) {
Vector<PatternTerm>& terms = alternatives[i]->m_terms;
if (terms.size()) {
PatternTerm& term = terms.last();
if (term.type == PatternTerm::TypeParenthesesSubpattern
&& term.quantityType == QuantifierGreedy
- && term.quantityCount == quantifyInfinite
+ && term.quantityMinCount == 0
+ && term.quantityMaxCount == quantifyInfinite
&& !term.capture())
term.parentheses.isTerminal = true;
}
@@ -722,7 +959,7 @@ public:
// At this point, this is only valid for non-multiline expressions.
PatternDisjunction* disjunction = m_pattern.m_body;
- if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
+ if (!m_pattern.m_containsBOL || m_pattern.multiline())
return;
PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
@@ -740,11 +977,12 @@ public:
}
}
- bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
+ bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
{
Vector<PatternTerm>& terms = alternative->m_terms;
- for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
+ ASSERT(endIndex <= terms.size());
+ for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
PatternTerm& term = terms[termIndex];
if (term.m_capture)
@@ -753,7 +991,7 @@ public:
if (term.type == PatternTerm::TypeParenthesesSubpattern) {
PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
- if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
+ if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
return true;
}
}
@@ -769,16 +1007,17 @@ public:
// beginning and the end of the match.
void optimizeDotStarWrappedExpressions()
{
- Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives;
+ Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
if (alternatives.size() != 1)
return;
+ CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
PatternAlternative* alternative = alternatives[0].get();
Vector<PatternTerm>& terms = alternative->m_terms;
if (terms.size() >= 3) {
bool startsWithBOL = false;
bool endsWithEOL = false;
- size_t termIndex, firstExpressionTerm, lastExpressionTerm;
+ size_t termIndex, firstExpressionTerm;
termIndex = 0;
if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
@@ -787,7 +1026,10 @@ public:
}
PatternTerm& firstNonAnchorTerm = terms[termIndex];
- if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
+ if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
+ || firstNonAnchorTerm.characterClass != dotCharacterClass
+ || firstNonAnchorTerm.quantityMinCount
+ || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
return;
firstExpressionTerm = termIndex + 1;
@@ -799,16 +1041,19 @@ public:
}
PatternTerm& lastNonAnchorTerm = terms[termIndex];
- if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
+ if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
+ || lastNonAnchorTerm.characterClass != dotCharacterClass
+ || lastNonAnchorTerm.quantityType != QuantifierGreedy
+ || lastNonAnchorTerm.quantityMinCount
+ || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
return;
-
- lastExpressionTerm = termIndex - 1;
- if (firstExpressionTerm > lastExpressionTerm)
+ size_t endIndex = termIndex;
+ if (firstExpressionTerm >= endIndex)
return;
- if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
- for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
+ if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) {
+ for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
terms.remove(termIndex);
for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
@@ -822,62 +1067,445 @@ public:
}
private:
+ bool isSafeToRecurse() const
+ {
+ if (!m_stackLimit)
+ return true;
+ int8_t* curr = reinterpret_cast<int8_t*>(&curr);
+ int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
+ return curr >= limit;
+ }
+
YarrPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
+ void* m_stackLimit;
bool m_invertCharacterClass;
- bool m_invertParentheticalAssertion;
+ bool m_invertParentheticalAssertion { false };
};
-const char* YarrPattern::compile(const String& patternString)
+ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
{
- YarrPatternConstructor constructor(*this);
+ YarrPatternConstructor constructor(*this, stackLimit);
- if (const char* error = parse(constructor, patternString))
- return error;
+ if (m_flags == InvalidFlags)
+ return ErrorCode::InvalidRegularExpressionFlags;
+
+ {
+ ErrorCode error = parse(constructor, patternString, unicode());
+ if (hasError(error))
+ return error;
+ }
// If the pattern contains illegal backreferences reset & reparse.
// Quoting Netscape's "What's new in JavaScript 1.2",
// "Note: if the number of left parentheses is less than the number specified
// in \#, the \# is taken as an octal escape as described in the next row."
if (containsIllegalBackReference()) {
+ if (unicode())
+ return ErrorCode::InvalidBackreference;
+
unsigned numSubpatterns = m_numSubpatterns;
constructor.reset();
-#if !ASSERT_DISABLED
- const char* error =
-#endif
- parse(constructor, patternString, numSubpatterns);
-
- ASSERT(!error);
+ ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
+ ASSERT_UNUSED(error, !hasError(error));
ASSERT(numSubpatterns == m_numSubpatterns);
}
constructor.checkForTerminalParentheses();
constructor.optimizeDotStarWrappedExpressions();
constructor.optimizeBOL();
-
- constructor.setupOffsets();
- return 0;
+ {
+ ErrorCode error = constructor.setupOffsets();
+ if (hasError(error))
+ return error;
+ }
+
+ if (Options::dumpCompiledRegExpPatterns())
+ dumpPattern(patternString);
+
+ return ErrorCode::NoError;
}
-YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error)
- : m_ignoreCase(ignoreCase)
- , m_multiline(multiline)
- , m_containsBackreferences(false)
+YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, ErrorCode& error, void* stackLimit)
+ : m_containsBackreferences(false)
, m_containsBOL(false)
- , m_numSubpatterns(0)
- , m_maxBackReference(0)
- , newlineCached(0)
- , digitsCached(0)
- , spacesCached(0)
- , wordcharCached(0)
- , nondigitsCached(0)
- , nonspacesCached(0)
- , nonwordcharCached(0)
+ , m_containsUnsignedLengthPattern(false)
+ , m_hasCopiedParenSubexpressions(false)
+ , m_saveInitialStartValue(false)
+ , m_flags(flags)
+{
+ error = compile(pattern, stackLimit);
+}
+
+void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
+{
+ out.print(" ");
+ for (; nestingDepth; --nestingDepth)
+ out.print(" ");
+}
+
+void dumpUChar32(PrintStream& out, UChar32 c)
+{
+ if (c >= ' '&& c <= 0xff)
+ out.printf("'%c'", static_cast<char>(c));
+ else
+ out.printf("0x%04x", c);
+}
+
+void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
+{
+ if (characterClass == pattern->anyCharacterClass())
+ out.print("<any character>");
+ else if (characterClass == pattern->newlineCharacterClass())
+ out.print("<newline>");
+ else if (characterClass == pattern->digitsCharacterClass())
+ out.print("<digits>");
+ else if (characterClass == pattern->spacesCharacterClass())
+ out.print("<whitespace>");
+ else if (characterClass == pattern->wordcharCharacterClass())
+ out.print("<word>");
+ else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
+ out.print("<unicode ignore case>");
+ else if (characterClass == pattern->nondigitsCharacterClass())
+ out.print("<non-digits>");
+ else if (characterClass == pattern->nonspacesCharacterClass())
+ out.print("<non-whitespace>");
+ else if (characterClass == pattern->nonwordcharCharacterClass())
+ out.print("<non-word>");
+ else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
+ out.print("<unicode non-ignore case>");
+ else {
+ bool needMatchesRangesSeperator = false;
+
+ auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
+ size_t matchesSize = matches.size();
+ if (matchesSize) {
+ if (needMatchesRangesSeperator)
+ out.print(",");
+ needMatchesRangesSeperator = true;
+
+ out.print(prefix, ":(");
+ for (size_t i = 0; i < matchesSize; ++i) {
+ if (i)
+ out.print(",");
+ dumpUChar32(out, matches[i]);
+ }
+ out.print(")");
+ }
+ };
+
+ auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
+ size_t rangeSize = ranges.size();
+ if (rangeSize) {
+ if (needMatchesRangesSeperator)
+ out.print(",");
+ needMatchesRangesSeperator = true;
+
+ out.print(prefix, " ranges:(");
+ for (size_t i = 0; i < rangeSize; ++i) {
+ if (i)
+ out.print(",");
+ CharacterRange range = ranges[i];
+ out.print("(");
+ dumpUChar32(out, range.begin);
+ out.print("..");
+ dumpUChar32(out, range.end);
+ out.print(")");
+ }
+ out.print(")");
+ }
+ };
+
+ out.print("[");
+ dumpMatches("ASCII", characterClass->m_matches);
+ dumpRanges("ASCII", characterClass->m_ranges);
+ dumpMatches("Unicode", characterClass->m_matchesUnicode);
+ dumpRanges("Unicode", characterClass->m_rangesUnicode);
+ out.print("]");
+ }
+}
+
+void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
+{
+ out.print("minimum size: ", m_minimumSize);
+ if (m_hasFixedSize)
+ out.print(",fixed size");
+ if (m_onceThrough)
+ out.print(",once through");
+ if (m_startsWithBOL)
+ out.print(",starts with ^");
+ if (m_containsBOL)
+ out.print(",contains ^");
+ out.print("\n");
+
+ for (size_t i = 0; i < m_terms.size(); ++i)
+ m_terms[i].dump(out, thisPattern, nestingDepth);
+}
+
+void PatternTerm::dumpQuantifier(PrintStream& out)
+{
+ if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
+ return;
+ out.print(" {", quantityMinCount.unsafeGet());
+ if (quantityMinCount != quantityMaxCount) {
+ if (quantityMaxCount == UINT_MAX)
+ out.print(",...");
+ else
+ out.print(",", quantityMaxCount.unsafeGet());
+ }
+ out.print("}");
+ if (quantityType == QuantifierGreedy)
+ out.print(" greedy");
+ else if (quantityType == QuantifierNonGreedy)
+ out.print(" non-greedy");
+}
+
+void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
+{
+ indentForNestingLevel(out, nestingDepth);
+
+ if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
+ if (invert())
+ out.print("not ");
+ }
+
+ switch (type) {
+ case TypeAssertionBOL:
+ out.println("BOL");
+ break;
+ case TypeAssertionEOL:
+ out.println("EOL");
+ break;
+ case TypeAssertionWordBoundary:
+ out.println("word boundary");
+ break;
+ case TypePatternCharacter:
+ out.printf("character ");
+ out.printf("inputPosition %u ", inputPosition);
+ if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
+ dumpUChar32(out, toASCIIUpper(patternCharacter));
+ out.print("/");
+ dumpUChar32(out, toASCIILower(patternCharacter));
+ } else
+ dumpUChar32(out, patternCharacter);
+ dumpQuantifier(out);
+ if (quantityType != QuantifierFixedCount)
+ out.print(",frame location ", frameLocation);
+ out.println();
+ break;
+ case TypeCharacterClass:
+ out.print("character class ");
+ if (characterClass->m_anyCharacter)
+ out.print("<any character>");
+ else if (characterClass == thisPattern->newlineCharacterClass())
+ out.print("<newline>");
+ else if (characterClass == thisPattern->digitsCharacterClass())
+ out.print("<digits>");
+ else if (characterClass == thisPattern->spacesCharacterClass())
+ out.print("<whitespace>");
+ else if (characterClass == thisPattern->wordcharCharacterClass())
+ out.print("<word>");
+ else if (characterClass == thisPattern->wordUnicodeIgnoreCaseCharCharacterClass())
+ out.print("<unicode ignore case>");
+ else if (characterClass == thisPattern->nondigitsCharacterClass())
+ out.print("<non-digits>");
+ else if (characterClass == thisPattern->nonspacesCharacterClass())
+ out.print("<non-whitespace>");
+ else if (characterClass == thisPattern->nonwordcharCharacterClass())
+ out.print("<non-word>");
+ else if (characterClass == thisPattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
+ out.print("<unicode non-ignore case>");
+ else {
+ bool needMatchesRangesSeperator = false;
+
+ auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
+ size_t matchesSize = matches.size();
+ if (matchesSize) {
+ if (needMatchesRangesSeperator)
+ out.print(",");
+ needMatchesRangesSeperator = true;
+
+ out.print(prefix, ":(");
+ for (size_t i = 0; i < matchesSize; ++i) {
+ if (i)
+ out.print(",");
+ dumpUChar32(out, matches[i]);
+ }
+ out.print(")");
+ }
+ };
+
+ auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
+ size_t rangeSize = ranges.size();
+ if (rangeSize) {
+ if (needMatchesRangesSeperator)
+ out.print(",");
+ needMatchesRangesSeperator = true;
+
+ out.print(prefix, " ranges:(");
+ for (size_t i = 0; i < rangeSize; ++i) {
+ if (i)
+ out.print(",");
+ CharacterRange range = ranges[i];
+ out.print("(");
+ dumpUChar32(out, range.begin);
+ out.print("..");
+ dumpUChar32(out, range.end);
+ out.print(")");
+ }
+ out.print(")");
+ }
+ };
+
+ out.print("[");
+ dumpMatches("ASCII", characterClass->m_matches);
+ dumpRanges("ASCII", characterClass->m_ranges);
+ dumpMatches("Unicode", characterClass->m_matchesUnicode);
+ dumpRanges("Unicode", characterClass->m_rangesUnicode);
+ out.print("]");
+ }
+ dumpQuantifier(out);
+ if (quantityType != QuantifierFixedCount || thisPattern->unicode())
+ out.print(",frame location ", frameLocation);
+ out.println();
+ break;
+ case TypeBackReference:
+ out.print("back reference to subpattern #", backReferenceSubpatternId);
+ out.println(",frame location ", frameLocation);
+ break;
+ case TypeForwardReference:
+ out.println("forward reference");
+ break;
+ case TypeParenthesesSubpattern:
+ if (m_capture)
+ out.print("captured ");
+ else
+ out.print("non-captured ");
+
+ FALLTHROUGH;
+ case TypeParentheticalAssertion:
+ if (m_invert)
+ out.print("inverted ");
+
+ if (type == TypeParenthesesSubpattern)
+ out.print("subpattern");
+ else if (type == TypeParentheticalAssertion)
+ out.print("assertion");
+
+ if (m_capture)
+ out.print(" #", parentheses.subpatternId);
+
+ dumpQuantifier(out);
+
+ if (parentheses.isCopy)
+ out.print(",copy");
+
+ if (parentheses.isTerminal)
+ out.print(",terminal");
+
+ out.println(",frame location ", frameLocation);
+
+ if (parentheses.disjunction->m_alternatives.size() > 1) {
+ indentForNestingLevel(out, nestingDepth + 1);
+ unsigned alternativeFrameLocation = frameLocation;
+ if (quantityMaxCount == 1 && !parentheses.isCopy)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ else if (parentheses.isTerminal)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
+ else
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
+ out.println("alternative list,frame location ", alternativeFrameLocation);
+ }
+
+ parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
+ break;
+ case TypeDotStarEnclosure:
+ out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
+ break;
+ }
+}
+
+void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
+{
+ unsigned alternativeCount = m_alternatives.size();
+ for (unsigned i = 0; i < alternativeCount; ++i) {
+ indentForNestingLevel(out, nestingDepth);
+ if (alternativeCount > 1)
+ out.print("alternative #", i, ": ");
+ m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
+ }
+}
+
+void YarrPattern::dumpPattern(const String& patternString)
+{
+ dumpPattern(WTF::dataFile(), patternString);
+}
+
+void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
+{
+ out.print("RegExp pattern for /");
+ out.print(patternString);
+ out.print("/");
+ if (global())
+ out.print("g");
+ if (ignoreCase())
+ out.print("i");
+ if (multiline())
+ out.print("m");
+ if (unicode())
+ out.print("u");
+ if (sticky())
+ out.print("y");
+ if (m_flags != NoFlags) {
+ bool printSeperator = false;
+ out.print(" (");
+ if (global()) {
+ out.print("global");
+ printSeperator = true;
+ }
+ if (ignoreCase()) {
+ if (printSeperator)
+ out.print("|");
+ out.print("ignore case");
+ printSeperator = true;
+ }
+ if (multiline()) {
+ if (printSeperator)
+ out.print("|");
+ out.print("multiline");
+ printSeperator = true;
+ }
+ if (unicode()) {
+ if (printSeperator)
+ out.print("|");
+ out.print("unicode");
+ printSeperator = true;
+ }
+ if (sticky()) {
+ if (printSeperator)
+ out.print("|");
+ out.print("sticky");
+ printSeperator = true;
+ }
+ out.print(")");
+ }
+ out.print(":\n");
+ if (m_body->m_callFrameSize)
+ out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
+ m_body->dump(out, this);
+}
+
+std::unique_ptr<CharacterClass> anycharCreate()
{
- *error = compile(pattern);
+ auto characterClass = std::make_unique<CharacterClass>();
+ characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
+ characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
+ characterClass->m_hasNonBMPCharacters = true;
+ characterClass->m_anyCharacter = true;
+ return characterClass;
}
-} }
+} } // namespace JSC::Yarr