aboutsummaryrefslogtreecommitdiffstats
path: root/src/3rdparty/masm/yarr
diff options
context:
space:
mode:
authorJędrzej Nowacki <jedrzej.nowacki@digia.com>2013-01-29 14:20:50 +0100
committerLars Knoll <lars.knoll@digia.com>2013-01-30 13:20:40 +0100
commit74fba4d8069c946c1ba12b9ac4d4026aaf14118b (patch)
tree138cc204ec33ef0c838c98355ea84846e11ca1a7 /src/3rdparty/masm/yarr
parent0781ecb087b027cccc1c44de1a1c7520cc89e2d2 (diff)
Say hello to QtV4 module.
Change-Id: I507cd5707b7d7223a0d901cf939896fb2649b684 Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Diffstat (limited to 'src/3rdparty/masm/yarr')
-rw-r--r--src/3rdparty/masm/yarr/Yarr.h69
-rw-r--r--src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp463
-rw-r--r--src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h138
-rw-r--r--src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js219
-rw-r--r--src/3rdparty/masm/yarr/YarrInterpreter.cpp1964
-rw-r--r--src/3rdparty/masm/yarr/YarrInterpreter.h385
-rw-r--r--src/3rdparty/masm/yarr/YarrJIT.cpp2667
-rw-r--r--src/3rdparty/masm/yarr/YarrJIT.h141
-rw-r--r--src/3rdparty/masm/yarr/YarrParser.h880
-rw-r--r--src/3rdparty/masm/yarr/YarrPattern.cpp874
-rw-r--r--src/3rdparty/masm/yarr/YarrPattern.h421
-rw-r--r--src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp59
-rw-r--r--src/3rdparty/masm/yarr/YarrSyntaxChecker.h38
-rw-r--r--src/3rdparty/masm/yarr/yarr.pri12
14 files changed, 8330 insertions, 0 deletions
diff --git a/src/3rdparty/masm/yarr/Yarr.h b/src/3rdparty/masm/yarr/Yarr.h
new file mode 100644
index 0000000000..d393e9fa90
--- /dev/null
+++ b/src/3rdparty/masm/yarr/Yarr.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef Yarr_h
+#define Yarr_h
+
+#include "YarrInterpreter.h"
+#include "YarrPattern.h"
+
+namespace JSC { namespace Yarr {
+
+#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoBackReference 2
+#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
+#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1
+#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
+#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1
+#define YarrStackSpaceForBackTrackInfoParentheses 2
+
+static const unsigned quantifyInfinite = UINT_MAX;
+static const unsigned offsetNoMatch = (unsigned)-1;
+
+// The below limit restricts the number of "recursive" match calls in order to
+// avoid spending exponential time on complex regular expressions.
+static const unsigned matchLimit = 1000000;
+
+enum JSRegExpResult {
+ JSRegExpMatch = 1,
+ JSRegExpNoMatch = 0,
+ JSRegExpErrorNoMatch = -1,
+ JSRegExpErrorHitLimit = -2,
+ JSRegExpErrorNoMemory = -3,
+ JSRegExpErrorInternal = -4
+};
+
+enum YarrCharSize {
+ Char8,
+ Char16
+};
+
+} } // namespace JSC::Yarr
+
+#endif // Yarr_h
+
diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp
new file mode 100644
index 0000000000..7bb3d08eb5
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js
+
+#include "config.h"
+#include "YarrCanonicalizeUCS2.h"
+
+namespace JSC { namespace Yarr {
+
+#include <stdint.h>
+
+uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 };
+uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 };
+uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 };
+uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 };
+uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 };
+uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 };
+uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 };
+uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 };
+uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 };
+uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 };
+uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 };
+uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 };
+uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 };
+uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 };
+uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 };
+
+static const size_t UCS2_CANONICALIZATION_SETS = 15;
+uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {
+ ucs2CharacterSet0,
+ ucs2CharacterSet1,
+ ucs2CharacterSet2,
+ ucs2CharacterSet3,
+ ucs2CharacterSet4,
+ ucs2CharacterSet5,
+ ucs2CharacterSet6,
+ ucs2CharacterSet7,
+ ucs2CharacterSet8,
+ ucs2CharacterSet9,
+ ucs2CharacterSet10,
+ ucs2CharacterSet11,
+ ucs2CharacterSet12,
+ ucs2CharacterSet13,
+ ucs2CharacterSet14,
+};
+
+const size_t UCS2_CANONICALIZATION_RANGES = 364;
+UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {
+ { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique },
+ { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo },
+ { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique },
+ { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi },
+ { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique },
+ { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet },
+ { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique },
+ { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo },
+ { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique },
+ { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo },
+ { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique },
+ { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi },
+ { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique },
+ { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi },
+ { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo },
+ { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique },
+ { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique },
+ { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique },
+ { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi },
+ { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique },
+ { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo },
+ { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo },
+ { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo },
+ { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo },
+ { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique },
+ { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo },
+ { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo },
+ { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo },
+ { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo },
+ { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo },
+ { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo },
+ { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo },
+ { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo },
+ { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo },
+ { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique },
+ { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo },
+ { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo },
+ { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo },
+ { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo },
+ { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo },
+ { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo },
+ { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique },
+ { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo },
+ { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo },
+ { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo },
+ { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique },
+ { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique },
+ { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo },
+ { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique },
+ { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet },
+ { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet },
+ { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet },
+ { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi },
+ { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique },
+ { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet },
+ { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi },
+ { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi },
+ { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi },
+ { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique },
+ { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique },
+ { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo },
+ { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi },
+ { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo },
+ { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo },
+ { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi },
+ { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo },
+ { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo },
+ { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo },
+ { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo },
+ { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo },
+ { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi },
+ { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi },
+ { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique },
+ { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi },
+ { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique },
+ { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi },
+ { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique },
+ { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi },
+ { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique },
+ { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi },
+ { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique },
+ { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi },
+ { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique },
+ { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo },
+ { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique },
+ { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi },
+ { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi },
+ { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique },
+ { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo },
+ { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique },
+ { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi },
+ { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique },
+ { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo },
+ { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi },
+ { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique },
+ { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi },
+ { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique },
+ { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo },
+ { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique },
+ { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique },
+ { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique },
+ { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi },
+ { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi },
+ { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi },
+ { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi },
+ { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique },
+ { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi },
+ { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique },
+ { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet },
+ { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique },
+ { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique },
+ { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique },
+ { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo },
+ { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique },
+ { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo },
+ { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique },
+ { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo },
+ { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique },
+ { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo },
+ { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique },
+ { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo },
+ { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique },
+ { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet },
+ { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet },
+ { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo },
+ { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet },
+ { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet },
+ { 0x039au, 0x039au, 0x0008u, CanonicalizeSet },
+ { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo },
+ { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet },
+ { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo },
+ { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet },
+ { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet },
+ { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique },
+ { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet },
+ { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo },
+ { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet },
+ { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo },
+ { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi },
+ { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi },
+ { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique },
+ { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet },
+ { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet },
+ { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet },
+ { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet },
+ { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet },
+ { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet },
+ { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet },
+ { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet },
+ { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet },
+ { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi },
+ { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet },
+ { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi },
+ { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi },
+ { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi },
+ { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo },
+ { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet },
+ { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet },
+ { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique },
+ { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet },
+ { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet },
+ { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi },
+ { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet },
+ { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet },
+ { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo },
+ { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique },
+ { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet },
+ { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique },
+ { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi },
+ { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique },
+ { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi },
+ { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo },
+ { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo },
+ { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi },
+ { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi },
+ { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique },
+ { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo },
+ { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi },
+ { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique },
+ { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo },
+ { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique },
+ { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi },
+ { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique },
+ { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo },
+ { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique },
+ { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo },
+ { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique },
+ { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo },
+ { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique },
+ { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet },
+ { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique },
+ { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet },
+ { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique },
+ { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique },
+ { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique },
+ { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique },
+ { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique },
+ { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique },
+ { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique },
+ { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique },
+ { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique },
+ { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique },
+ { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique },
+ { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique },
+ { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi },
+ { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo },
+ { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo },
+ { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo },
+ { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo },
+ { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo },
+ { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo },
+ { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique },
+ { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi },
+ { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique },
+ { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet },
+ { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi },
+ { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique },
+ { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi },
+ { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique },
+ { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo },
+ { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique },
+ { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo },
+ { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique },
+ { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi },
+ { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi },
+ { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi },
+ { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique },
+ { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi },
+ { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi },
+ { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique },
+ { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo },
+ { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique },
+ { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi },
+ { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique },
+ { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo },
+ { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi },
+ { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique },
+ { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique },
+ { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo },
+ { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi },
+ { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique },
+ { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo },
+ { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique },
+ { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi },
+ { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique },
+ { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi },
+ { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi },
+ { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi },
+ { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi },
+ { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi },
+ { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi },
+ { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi },
+ { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi },
+ { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi },
+ { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique },
+ { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique },
+ { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique },
+ { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi },
+ { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique },
+ { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique },
+ { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi },
+ { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique },
+ { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique },
+ { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique },
+ { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique },
+ { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique },
+ { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi },
+ { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique },
+ { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned },
+ { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi },
+ { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique },
+ { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique },
+ { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned },
+ { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique },
+ { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo },
+ { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique },
+ { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi },
+ { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique },
+};
+
+const size_t LATIN_CANONICALIZATION_RANGES = 20;
+LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {
+ { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf },
+ { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 },
+ { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf },
+ { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther },
+ { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther },
+ { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid },
+ { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther },
+ { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid },
+};
+
+} } // JSC::Yarr
+
diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h
new file mode 100644
index 0000000000..be0ead43d2
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrCanonicalizeUCS2_H
+#define YarrCanonicalizeUCS2_H
+
+#include <stdint.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
+// provides information for each UCS2 code point as to the set of code points that it should
+// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8.
+enum UCS2CanonicalizationType {
+ CanonicalizeUnique, // No canonically equal values, e.g. 0x0.
+ CanonicalizeSet, // Value indicates a set in characterSetInfo.
+ CanonicalizeRangeLo, // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61.
+ CanonicalizeRangeHi, // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41.
+ CanonicalizeAlternatingAligned, // Aligned consequtive pair, e.g. 0x1f4,0x1f5.
+ CanonicalizeAlternatingUnaligned, // Unaligned consequtive pair, e.g. 0x241,0x242.
+};
+struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; };
+extern const size_t UCS2_CANONICALIZATION_RANGES;
+extern uint16_t* characterSetInfo[];
+extern UCS2CanonicalizationRange rangeInfo[];
+
+// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to
+// the set of Latin1 codepoints that could match.
+enum LatinCanonicalizationType {
+ CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range.
+ CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20.
+ CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is.
+ CanonicalizeLatinInvalid, // Cannot match against Latin1 input.
+};
+struct LatinCanonicalizationRange { uint16_t begin, end, value, type; };
+extern const size_t LATIN_CANONICALIZATION_RANGES;
+extern LatinCanonicalizationRange latinRangeInfo[];
+
+// This searches in log2 time over ~364 entries, so should typically result in 8 compares.
+inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch)
+{
+ UCS2CanonicalizationRange* info = rangeInfo;
+ size_t entries = UCS2_CANONICALIZATION_RANGES;
+
+ while (true) {
+ size_t candidate = entries >> 1;
+ UCS2CanonicalizationRange* candidateInfo = info + candidate;
+ if (ch < candidateInfo->begin)
+ entries = candidate;
+ else if (ch <= candidateInfo->end)
+ return candidateInfo;
+ else {
+ info = candidateInfo + 1;
+ entries -= (candidate + 1);
+ }
+ }
+}
+
+// Should only be called for characters that have one canonically matching value.
+inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch)
+{
+ ASSERT(ch >= info->begin && ch <= info->end);
+ switch (info->type) {
+ case CanonicalizeRangeLo:
+ return ch + info->value;
+ case CanonicalizeRangeHi:
+ return ch - info->value;
+ case CanonicalizeAlternatingAligned:
+ return ch ^ 1;
+ case CanonicalizeAlternatingUnaligned:
+ return ((ch - 1) ^ 1) + 1;
+ default:
+ ASSERT_NOT_REACHED();
+ }
+ ASSERT_NOT_REACHED();
+ return 0;
+}
+
+// Returns true if no other UCS2 codepoint can match this value.
+inline bool isCanonicallyUnique(UChar ch)
+{
+ return rangeInfoFor(ch)->type == CanonicalizeUnique;
+}
+
+// Returns true if values are equal, under the canonicalization rules.
+inline bool areCanonicallyEquivalent(UChar a, UChar b)
+{
+ UCS2CanonicalizationRange* info = rangeInfoFor(a);
+ switch (info->type) {
+ case CanonicalizeUnique:
+ return a == b;
+ case CanonicalizeSet: {
+ for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) {
+ if (a == b)
+ return true;
+ }
+ return false;
+ }
+ case CanonicalizeRangeLo:
+ return (a == b) || (a + info->value == b);
+ case CanonicalizeRangeHi:
+ return (a == b) || (a - info->value == b);
+ case CanonicalizeAlternatingAligned:
+ return (a | 1) == (b | 1);
+ case CanonicalizeAlternatingUnaligned:
+ return ((a - 1) | 1) == ((b - 1) | 1);
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+}
+
+} } // JSC::Yarr
+
+#endif
diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js
new file mode 100644
index 0000000000..00361dd46e
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// See ES 5.1, 15.10.2.8
+function canonicalize(ch)
+{
+ var u = String.fromCharCode(ch).toUpperCase();
+ if (u.length > 1)
+ return ch;
+ var cu = u.charCodeAt(0);
+ if (ch >= 128 && cu < 128)
+ return ch;
+ return cu;
+}
+
+var MAX_UCS2 = 0xFFFF;
+var MAX_LATIN = 0xFF;
+
+var groupedCanonically = [];
+// Pass 1: populate groupedCanonically - this is mapping from canonicalized
+// values back to the set of character code that canonicalize to them.
+for (var i = 0; i <= MAX_UCS2; ++i) {
+ var ch = canonicalize(i);
+ if (!groupedCanonically[ch])
+ groupedCanonically[ch] = [];
+ groupedCanonically[ch].push(i);
+}
+
+var typeInfo = [];
+var latinTypeInfo = [];
+var characterSetInfo = [];
+// Pass 2: populate typeInfo & characterSetInfo. For every character calculate
+// a typeInfo value, described by the types above, and a value payload.
+for (cu in groupedCanonically) {
+ // The set of characters that canonicalize to cu
+ var characters = groupedCanonically[cu];
+
+ // If there is only one, it is unique.
+ if (characters.length == 1) {
+ typeInfo[characters[0]] = "CanonicalizeUnique:0";
+ latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
+ continue;
+ }
+
+ // Sort the array.
+ characters.sort(function(x,y){return x-y;});
+
+ // If there are more than two characters, create an entry in characterSetInfo.
+ if (characters.length > 2) {
+ for (i in characters)
+ typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length;
+ characterSetInfo.push(characters);
+
+ if (characters[1] <= MAX_LATIN)
+ throw new Error("sets with more than one latin character not supported!");
+ if (characters[0] <= MAX_LATIN) {
+ for (i in characters)
+ latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0];
+ latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0";
+ } else {
+ for (i in characters)
+ latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0";
+ }
+
+ continue;
+ }
+
+ // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner.
+ var lo = characters[0];
+ var hi = characters[1];
+ var delta = hi - lo;
+ if (delta == 1) {
+ var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
+ typeInfo[lo] = type;
+ typeInfo[hi] = type;
+ } else {
+ typeInfo[lo] = "CanonicalizeRangeLo:" + delta;
+ typeInfo[hi] = "CanonicalizeRangeHi:" + delta;
+ }
+
+ if (lo > MAX_LATIN) {
+ latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0";
+ } else if (hi > MAX_LATIN) {
+ latinTypeInfo[lo] = "CanonicalizeLatinSelf:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo;
+ } else {
+ if (delta != 0x20 || lo & 0x20)
+ throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
+ latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0";
+ latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0";
+ }
+}
+
+var rangeInfo = [];
+// Pass 3: coallesce types into ranges.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+ var begin = end;
+ var type = typeInfo[end];
+ while (end < MAX_UCS2 && typeInfo[end + 1] == type)
+ ++end;
+ rangeInfo.push({begin:begin, end:end, type:type});
+}
+
+var latinRangeInfo = [];
+// Pass 4: coallesce latin-1 types into ranges.
+for (var end = 0; end <= MAX_UCS2; ++end) {
+ var begin = end;
+ var type = latinTypeInfo[end];
+ while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type)
+ ++end;
+ latinRangeInfo.push({begin:begin, end:end, type:type});
+}
+
+
+// Helper function to convert a number to a fixed width hex representation of a C uint16_t.
+function hex(x)
+{
+ var s = Number(x).toString(16);
+ while (s.length < 4)
+ s = 0 + s;
+ return "0x" + s + "u";
+}
+
+var copyright = (
+ "/*" + "\n" +
+ " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" +
+ " *" + "\n" +
+ " * Redistribution and use in source and binary forms, with or without" + "\n" +
+ " * modification, are permitted provided that the following conditions" + "\n" +
+ " * are met:" + "\n" +
+ " * 1. Redistributions of source code must retain the above copyright" + "\n" +
+ " * notice, this list of conditions and the following disclaimer." + "\n" +
+ " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" +
+ " * notice, this list of conditions and the following disclaimer in the" + "\n" +
+ " * documentation and/or other materials provided with the distribution." + "\n" +
+ " *" + "\n" +
+ " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" +
+ " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" +
+ " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" +
+ " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" +
+ " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" +
+ " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" +
+ " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" +
+ " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" +
+ " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" +
+ " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" +
+ " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" +
+ " */");
+
+print(copyright);
+print();
+print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
+print();
+print('#include "config.h"');
+print('#include "YarrCanonicalizeUCS2.h"');
+print();
+print("namespace JSC { namespace Yarr {");
+print();
+print("#include <stdint.h>");
+print();
+
+for (i in characterSetInfo) {
+ var characters = ""
+ var set = characterSetInfo[i];
+ for (var j in set)
+ characters += hex(set[j]) + ", ";
+ print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };");
+}
+print();
+print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";");
+print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
+for (i in characterSetInfo)
+print(" ucs2CharacterSet" + i + ",");
+print("};");
+print();
+print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";");
+print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
+for (i in rangeInfo) {
+ var info = rangeInfo[i];
+ var typeAndValue = info.type.split(':');
+ print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";");
+print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
+for (i in latinRangeInfo) {
+ var info = latinRangeInfo[i];
+ var typeAndValue = info.type.split(':');
+ print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },");
+}
+print("};");
+print();
+print("} } // JSC::Yarr");
+print();
+
diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.cpp b/src/3rdparty/masm/yarr/YarrInterpreter.cpp
new file mode 100644
index 0000000000..31603f6d34
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrInterpreter.cpp
@@ -0,0 +1,1964 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrInterpreter.h"
+
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+#include <wtf/BumpPointerAllocator.h>
+#include <wtf/DataLog.h>
+#include <wtf/text/CString.h>
+#include <wtf/text/WTFString.h>
+
+#ifndef NDEBUG
+#include <stdio.h>
+#endif
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+template<typename CharType>
+class Interpreter {
+public:
+ struct ParenthesesDisjunctionContext;
+
+ struct BackTrackInfoPatternCharacter {
+ uintptr_t matchAmount;
+ };
+ struct BackTrackInfoCharacterClass {
+ uintptr_t matchAmount;
+ };
+ struct BackTrackInfoBackReference {
+ uintptr_t begin; // Not really needed for greedy quantifiers.
+ uintptr_t matchAmount; // Not really needed for fixed quantifiers.
+ };
+ struct BackTrackInfoAlternative {
+ uintptr_t offset;
+ };
+ struct BackTrackInfoParentheticalAssertion {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesOnce {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesTerminal {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParentheses {
+ uintptr_t matchAmount;
+ ParenthesesDisjunctionContext* lastContext;
+ };
+
+ static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
+ {
+ context->next = backTrack->lastContext;
+ backTrack->lastContext = context;
+ ++backTrack->matchAmount;
+ }
+
+ static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
+ {
+ ASSERT(backTrack->matchAmount);
+ ASSERT(backTrack->lastContext);
+ backTrack->lastContext = backTrack->lastContext->next;
+ --backTrack->matchAmount;
+ }
+
+ struct DisjunctionContext
+ {
+ DisjunctionContext()
+ : term(0)
+ {
+ }
+
+ void* operator new(size_t, void* where)
+ {
+ return where;
+ }
+
+ int term;
+ unsigned matchBegin;
+ unsigned matchEnd;
+ uintptr_t frame[1];
+ };
+
+ DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
+ {
+ size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
+ allocatorPool = allocatorPool->ensureCapacity(size);
+ if (!allocatorPool)
+ CRASH();
+ return new (allocatorPool->alloc(size)) DisjunctionContext();
+ }
+
+ void freeDisjunctionContext(DisjunctionContext* context)
+ {
+ allocatorPool = allocatorPool->dealloc(context);
+ }
+
+ struct ParenthesesDisjunctionContext
+ {
+ ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
+ : next(0)
+ {
+ unsigned firstSubpatternId = term.atom.subpatternId;
+ unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
+
+ for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
+ subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
+ output[(firstSubpatternId << 1) + i] = offsetNoMatch;
+ }
+
+ new (getDisjunctionContext(term)) DisjunctionContext();
+ }
+
+ void* operator new(size_t, void* where)
+ {
+ return where;
+ }
+
+ void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
+ {
+ for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
+ output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
+ }
+
+ DisjunctionContext* getDisjunctionContext(ByteTerm& term)
+ {
+ return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
+ }
+
+ ParenthesesDisjunctionContext* next;
+ unsigned subpatternBackup[1];
+ };
+
+ ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
+ {
+ size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
+ allocatorPool = allocatorPool->ensureCapacity(size);
+ if (!allocatorPool)
+ CRASH();
+ return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
+ }
+
+ void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
+ {
+ allocatorPool = allocatorPool->dealloc(context);
+ }
+
+ class InputStream {
+ public:
+ InputStream(const CharType* input, unsigned start, unsigned length)
+ : input(input)
+ , pos(start)
+ , length(length)
+ {
+ }
+
+ void next()
+ {
+ ++pos;
+ }
+
+ void rewind(unsigned amount)
+ {
+ ASSERT(pos >= amount);
+ pos -= amount;
+ }
+
+ int read()
+ {
+ ASSERT(pos < length);
+ if (pos < length)
+ return input[pos];
+ return -1;
+ }
+
+ int readPair()
+ {
+ ASSERT(pos + 1 < length);
+ return input[pos] | input[pos + 1] << 16;
+ }
+
+ int readChecked(unsigned negativePositionOffest)
+ {
+ if (pos < negativePositionOffest)
+ CRASH();
+ unsigned p = pos - negativePositionOffest;
+ ASSERT(p < length);
+ return input[p];
+ }
+
+ int reread(unsigned from)
+ {
+ ASSERT(from < length);
+ return input[from];
+ }
+
+ int prev()
+ {
+ ASSERT(!(pos > length));
+ if (pos && length)
+ return input[pos - 1];
+ return -1;
+ }
+
+ unsigned getPos()
+ {
+ return pos;
+ }
+
+ void setPos(unsigned p)
+ {
+ pos = p;
+ }
+
+ bool atStart()
+ {
+ return pos == 0;
+ }
+
+ bool atEnd()
+ {
+ return pos == length;
+ }
+
+ unsigned end()
+ {
+ return length;
+ }
+
+ bool checkInput(unsigned count)
+ {
+ if (((pos + count) <= length) && ((pos + count) >= pos)) {
+ pos += count;
+ return true;
+ }
+ return false;
+ }
+
+ void uncheckInput(unsigned count)
+ {
+ if (pos < count)
+ CRASH();
+ pos -= count;
+ }
+
+ bool atStart(unsigned negativePositionOffest)
+ {
+ return pos == negativePositionOffest;
+ }
+
+ bool atEnd(unsigned negativePositionOffest)
+ {
+ if (pos < negativePositionOffest)
+ CRASH();
+ return (pos - negativePositionOffest) == length;
+ }
+
+ bool isAvailableInput(unsigned offset)
+ {
+ return (((pos + offset) <= length) && ((pos + offset) >= pos));
+ }
+
+ private:
+ const CharType* input;
+ unsigned pos;
+ unsigned length;
+ };
+
+ bool testCharacterClass(CharacterClass* characterClass, int ch)
+ {
+ if (ch & 0xFF80) {
+ for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
+ if (ch == characterClass->m_matchesUnicode[i])
+ return true;
+ for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
+ if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
+ return true;
+ } else {
+ for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
+ if (ch == characterClass->m_matches[i])
+ return true;
+ for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
+ if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
+ return true;
+ }
+
+ return false;
+ }
+
+ bool checkCharacter(int testChar, unsigned negativeInputOffset)
+ {
+ return testChar == input.readChecked(negativeInputOffset);
+ }
+
+ bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
+ {
+ int ch = input.readChecked(negativeInputOffset);
+ return (loChar == ch) || (hiChar == ch);
+ }
+
+ bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
+ {
+ bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
+ return invert ? !match : match;
+ }
+
+ bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
+ {
+ unsigned matchSize = (unsigned)(matchEnd - matchBegin);
+
+ if (!input.checkInput(matchSize))
+ return false;
+
+ if (pattern->m_ignoreCase) {
+ for (unsigned i = 0; i < matchSize; ++i) {
+ int oldCh = input.reread(matchBegin + i);
+ int ch = input.readChecked(negativeInputOffset + matchSize - i);
+
+ if (oldCh == ch)
+ continue;
+
+ // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
+ // unicode values are never allowed to match against ascii ones.
+ if (isASCII(oldCh) || isASCII(ch)) {
+ if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
+ continue;
+ } else if (areCanonicallyEquivalent(oldCh, ch))
+ continue;
+
+ input.uncheckInput(matchSize);
+ return false;
+ }
+ } else {
+ for (unsigned i = 0; i < matchSize; ++i) {
+ if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
+ input.uncheckInput(matchSize);
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ bool matchAssertionBOL(ByteTerm& term)
+ {
+ return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
+ }
+
+ bool matchAssertionEOL(ByteTerm& term)
+ {
+ if (term.inputPosition)
+ return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
+
+ return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
+ }
+
+ bool matchAssertionWordBoundary(ByteTerm& term)
+ {
+ bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
+ bool readIsWordchar;
+ if (term.inputPosition)
+ readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
+ else
+ readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
+
+ bool wordBoundary = prevIsWordchar != readIsWordchar;
+ return term.invert() ? !wordBoundary : wordBoundary;
+ }
+
+ bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
+ {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
+ return true;
+ }
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
+ {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
+ return true;
+ }
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeCharacterClass);
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+ if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
+ return false;
+ }
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ unsigned matchAmount = 0;
+ while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
+ input.uncheckInput(1);
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ return true;
+ }
+
+ case QuantifierNonGreedy:
+ backTrack->matchAmount = 0;
+ return true;
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeCharacterClass);
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
+ return true;
+ }
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeBackReference);
+ BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+ unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+ unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+ // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
+ // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
+ // Eg.: /(a\1)/
+ if (matchEnd == offsetNoMatch)
+ return true;
+
+ if (matchBegin == offsetNoMatch)
+ return true;
+
+ ASSERT(matchBegin <= matchEnd);
+
+ if (matchBegin == matchEnd)
+ return true;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ backTrack->begin = input.getPos();
+ for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+ if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+ input.setPos(backTrack->begin);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ unsigned matchAmount = 0;
+ while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
+ ++matchAmount;
+ backTrack->matchAmount = matchAmount;
+ return true;
+ }
+
+ case QuantifierNonGreedy:
+ backTrack->begin = input.getPos();
+ backTrack->matchAmount = 0;
+ return true;
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeBackReference);
+ BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+ unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+ unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+ if (matchBegin == offsetNoMatch)
+ return false;
+
+ ASSERT(matchBegin <= matchEnd);
+
+ if (matchBegin == matchEnd)
+ return false;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ // for quantityCount == 1, could rewind.
+ input.setPos(backTrack->begin);
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.rewind(matchEnd - matchBegin);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+ ++backTrack->matchAmount;
+ return true;
+ }
+ input.setPos(backTrack->begin);
+ break;
+ }
+
+ return false;
+ }
+
+ void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
+ {
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
+ output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
+ }
+ }
+ void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
+ {
+ unsigned firstSubpatternId = term.atom.subpatternId;
+ unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
+ context->restoreOutput(output, firstSubpatternId, count);
+ }
+ JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+ {
+ while (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+
+ JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch)
+ return JSRegExpMatch;
+
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+
+ return JSRegExpNoMatch;
+ }
+
+ bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy: {
+ // set this speculatively; if we get to the parens end this will be true.
+ backTrack->begin = input.getPos();
+ break;
+ }
+ case QuantifierNonGreedy: {
+ backTrack->begin = notFound;
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+ case QuantifierFixedCount:
+ break;
+ }
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
+ }
+
+ return true;
+ }
+
+ bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
+ }
+
+ if (term.atom.quantityType == QuantifierFixedCount)
+ return true;
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+ return backTrack->begin != input.getPos();
+ }
+
+ bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = offsetNoMatch;
+ output[(subpatternId << 1) + 1] = offsetNoMatch;
+ }
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy:
+ // if we backtrack to this point, there is another chance - try matching nothing.
+ ASSERT(backTrack->begin != notFound);
+ backTrack->begin = notFound;
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ case QuantifierNonGreedy:
+ ASSERT(backTrack->begin != notFound);
+ case QuantifierFixedCount:
+ break;
+ }
+
+ return false;
+ }
+
+ bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy:
+ if (backTrack->begin == notFound) {
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+ case QuantifierNonGreedy:
+ if (backTrack->begin == notFound) {
+ backTrack->begin = input.getPos();
+ if (term.capture()) {
+ // Technically this access to inputPosition should be accessing the begin term's
+ // inputPosition, but for repeats other than fixed these values should be
+ // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
+ ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
+ unsigned subpatternId = term.atom.subpatternId;
+ output[subpatternId << 1] = input.getPos() + term.inputPosition;
+ }
+ context->term -= term.atom.parenthesesWidth;
+ return true;
+ }
+ case QuantifierFixedCount:
+ break;
+ }
+
+ return false;
+ }
+
+ bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(!term.capture());
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ backTrack->begin = input.getPos();
+ return true;
+ }
+
+ bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
+
+ BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
+ // Empty match is a failed match.
+ if (backTrack->begin == input.getPos())
+ return false;
+
+ // Successful match! Okay, what's next? - loop around and try to match moar!
+ context->term -= (term.atom.parenthesesWidth + 1);
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+ ASSERT(term.atom.quantityType == QuantifierGreedy);
+ ASSERT(term.atom.quantityCount == quantifyInfinite);
+ ASSERT(!term.capture());
+
+ // If we backtrack to this point, we have failed to match this iteration of the parens.
+ // Since this is greedy / zero minimum a failed is also accepted as a match!
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+
+ bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
+ {
+ // 'Terminal' parentheses are at the end of the regex, and as such a match past end
+ // should always be returned as a successful match - we should never backtrack to here.
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+ backTrack->begin = input.getPos();
+ return true;
+ }
+
+ bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+ input.setPos(backTrack->begin);
+
+ // We've reached the end of the parens; if they are inverted, this is failure.
+ if (term.invert()) {
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+
+ return true;
+ }
+
+ bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ // We've failed to match parens; if they are inverted, this is win!
+ if (term.invert()) {
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+
+ return false;
+ }
+
+ bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+ input.setPos(backTrack->begin);
+
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+
+ JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+
+ BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+ ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+ backTrack->matchAmount = 0;
+ backTrack->lastContext = 0;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ // While we haven't yet reached our fixed limit,
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ // Try to do a match, and it it succeeds, add it to the list.
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ // The match failed; try to find an alternate point to carry on from.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ }
+ }
+
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ return JSRegExpMatch;
+ }
+
+ case QuantifierGreedy: {
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+
+ break;
+ }
+ }
+
+ if (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return JSRegExpMatch;
+ }
+
+ case QuantifierNonGreedy:
+ return JSRegExpMatch;
+ }
+
+ ASSERT_NOT_REACHED();
+ return JSRegExpErrorNoMatch;
+ }
+
+ // Rules for backtracking differ depending on whether this is greedy or non-greedy.
+ //
+ // Greedy matches never should try just adding more - you should already have done
+ // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
+ // you backtrack an item off the list needs checking, since we'll never have matched
+ // the one less case. Tracking forwards, still add as much as possible.
+ //
+ // Non-greedy, we've already done the one less case, so don't match on popping.
+ // We haven't done the one more case, so always try to add that.
+ //
+ JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+
+ BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+ ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+
+ ParenthesesDisjunctionContext* context = 0;
+ JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
+
+ if (result != JSRegExpMatch)
+ return result;
+
+ // While we haven't yet reached our fixed limit,
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ // Try to do a match, and it it succeeds, add it to the list.
+ context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+
+ if (result == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ // The match failed; try to find an alternate point to carry on from.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
+ }
+ }
+
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+ context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ return JSRegExpMatch;
+ }
+
+ case QuantifierGreedy: {
+ if (!backTrack->matchAmount)
+ return JSRegExpNoMatch;
+
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (parenthesesResult == JSRegExpMatch)
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (parenthesesResult != JSRegExpNoMatch)
+ return parenthesesResult;
+
+ break;
+ }
+ }
+ } else {
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+
+ if (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return JSRegExpMatch;
+ }
+
+ case QuantifierNonGreedy: {
+ // If we've not reached the limit, try to add one more match.
+ if (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
+ if (result == JSRegExpMatch) {
+ appendParenthesesDisjunctionContext(backTrack, context);
+ recordParenthesesMatch(term, context);
+ return JSRegExpMatch;
+ }
+
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+
+ // Nope - okay backtrack looking for an alternative.
+ while (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
+ if (result == JSRegExpMatch) {
+ // successful backtrack! we're back in the game!
+ if (backTrack->matchAmount) {
+ context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return JSRegExpMatch;
+ }
+
+ // pop a match off the stack
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+
+ if (result != JSRegExpNoMatch)
+ return result;
+ }
+
+ return JSRegExpNoMatch;
+ }
+ }
+
+ ASSERT_NOT_REACHED();
+ return JSRegExpErrorNoMatch;
+ }
+
+ bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
+ {
+ UNUSED_PARAM(term);
+ unsigned matchBegin = context->matchBegin;
+
+ if (matchBegin) {
+ for (matchBegin--; true; matchBegin--) {
+ if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
+ ++matchBegin;
+ break;
+ }
+
+ if (!matchBegin)
+ break;
+ }
+ }
+
+ unsigned matchEnd = input.getPos();
+
+ for (; (matchEnd != input.end())
+ && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
+
+ if (((matchBegin && term.anchors.m_bol)
+ || ((matchEnd != input.end()) && term.anchors.m_eol))
+ && !pattern->m_multiline)
+ return false;
+
+ context->matchBegin = matchBegin;
+ context->matchEnd = matchEnd;
+ return true;
+ }
+
+#define MATCH_NEXT() { ++context->term; goto matchAgain; }
+#define BACKTRACK() { --context->term; goto backtrack; }
+#define currentTerm() (disjunction->terms[context->term])
+ JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ {
+ if (!--remainingMatchCount)
+ return JSRegExpErrorHitLimit;
+
+ if (btrack)
+ BACKTRACK();
+
+ context->matchBegin = input.getPos();
+ context->term = 0;
+
+ matchAgain:
+ ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+ switch (currentTerm().type) {
+ case ByteTerm::TypeSubpatternBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeSubpatternEnd:
+ context->matchEnd = input.getPos();
+ return JSRegExpMatch;
+
+ case ByteTerm::TypeBodyAlternativeBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeBodyAlternativeDisjunction:
+ case ByteTerm::TypeBodyAlternativeEnd:
+ context->matchEnd = input.getPos();
+ return JSRegExpMatch;
+
+ case ByteTerm::TypeAlternativeBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeAlternativeDisjunction:
+ case ByteTerm::TypeAlternativeEnd: {
+ int offset = currentTerm().alternative.end;
+ BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+ backTrack->offset = offset;
+ context->term += offset;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypeAssertionBOL:
+ if (matchAssertionBOL(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeAssertionEOL:
+ if (matchAssertionEOL(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeAssertionWordBoundary:
+ if (matchAssertionWordBoundary(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypePatternCharacterOnce:
+ case ByteTerm::TypePatternCharacterFixed: {
+ for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
+ BACKTRACK();
+ }
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCharacterGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ unsigned matchAmount = 0;
+ while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
+ input.uncheckInput(1);
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCharacterNonGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ backTrack->matchAmount = 0;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypePatternCasedCharacterOnce:
+ case ByteTerm::TypePatternCasedCharacterFixed: {
+ for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
+ BACKTRACK();
+ }
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCasedCharacterGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ unsigned matchAmount = 0;
+ while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
+ input.uncheckInput(1);
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCasedCharacterNonGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ backTrack->matchAmount = 0;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypeCharacterClass:
+ if (matchCharacterClass(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeBackReference:
+ if (matchBackReference(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = matchParentheses(currentTerm(), context);
+
+ if (result == JSRegExpMatch) {
+ MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
+ BACKTRACK();
+ }
+ case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+ if (matchParenthesesOnceBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+ if (matchParenthesesOnceEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (matchParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (matchParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionBegin:
+ if (matchParentheticalAssertionBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionEnd:
+ if (matchParentheticalAssertionEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeCheckInput:
+ if (input.checkInput(currentTerm().checkInputCount))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeUncheckInput:
+ input.uncheckInput(currentTerm().checkInputCount);
+ MATCH_NEXT();
+
+ case ByteTerm::TypeDotStarEnclosure:
+ if (matchDotStarEnclosure(currentTerm(), context))
+ return JSRegExpMatch;
+ BACKTRACK();
+ }
+
+ // We should never fall-through to here.
+ ASSERT_NOT_REACHED();
+
+ backtrack:
+ ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+ switch (currentTerm().type) {
+ case ByteTerm::TypeSubpatternBegin:
+ return JSRegExpNoMatch;
+ case ByteTerm::TypeSubpatternEnd:
+ ASSERT_NOT_REACHED();
+
+ case ByteTerm::TypeBodyAlternativeBegin:
+ case ByteTerm::TypeBodyAlternativeDisjunction: {
+ int offset = currentTerm().alternative.next;
+ context->term += offset;
+ if (offset > 0)
+ MATCH_NEXT();
+
+ if (input.atEnd())
+ return JSRegExpNoMatch;
+
+ input.next();
+
+ context->matchBegin = input.getPos();
+
+ if (currentTerm().alternative.onceThrough)
+ context->term += currentTerm().alternative.next;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypeBodyAlternativeEnd:
+ ASSERT_NOT_REACHED();
+
+ case ByteTerm::TypeAlternativeBegin:
+ case ByteTerm::TypeAlternativeDisjunction: {
+ int offset = currentTerm().alternative.next;
+ context->term += offset;
+ if (offset > 0)
+ MATCH_NEXT();
+ BACKTRACK();
+ }
+ case ByteTerm::TypeAlternativeEnd: {
+ // We should never backtrack back into an alternative of the main body of the regex.
+ BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+ unsigned offset = backTrack->offset;
+ context->term -= offset;
+ BACKTRACK();
+ }
+
+ case ByteTerm::TypeAssertionBOL:
+ case ByteTerm::TypeAssertionEOL:
+ case ByteTerm::TypeAssertionWordBoundary:
+ BACKTRACK();
+
+ case ByteTerm::TypePatternCharacterOnce:
+ case ByteTerm::TypePatternCharacterFixed:
+ case ByteTerm::TypePatternCharacterGreedy:
+ case ByteTerm::TypePatternCharacterNonGreedy:
+ if (backtrackPatternCharacter(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypePatternCasedCharacterOnce:
+ case ByteTerm::TypePatternCasedCharacterFixed:
+ case ByteTerm::TypePatternCasedCharacterGreedy:
+ case ByteTerm::TypePatternCasedCharacterNonGreedy:
+ if (backtrackPatternCasedCharacter(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeCharacterClass:
+ if (backtrackCharacterClass(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeBackReference:
+ if (backtrackBackReference(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpattern: {
+ JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+
+ if (result == JSRegExpMatch) {
+ MATCH_NEXT();
+ } else if (result != JSRegExpNoMatch)
+ return result;
+
+ BACKTRACK();
+ }
+ case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+ if (backtrackParenthesesOnceBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+ if (backtrackParenthesesOnceEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+ if (backtrackParenthesesTerminalBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+ if (backtrackParenthesesTerminalEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionBegin:
+ if (backtrackParentheticalAssertionBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionEnd:
+ if (backtrackParentheticalAssertionEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeCheckInput:
+ input.uncheckInput(currentTerm().checkInputCount);
+ BACKTRACK();
+
+ case ByteTerm::TypeUncheckInput:
+ input.checkInput(currentTerm().checkInputCount);
+ BACKTRACK();
+
+ case ByteTerm::TypeDotStarEnclosure:
+ ASSERT_NOT_REACHED();
+ }
+
+ ASSERT_NOT_REACHED();
+ return JSRegExpErrorNoMatch;
+ }
+
+ JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ {
+ JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
+
+ if (result == JSRegExpMatch) {
+ while (context->matchBegin == context->matchEnd) {
+ result = matchDisjunction(disjunction, context, true);
+ if (result != JSRegExpMatch)
+ return result;
+ }
+ return JSRegExpMatch;
+ }
+
+ return result;
+ }
+
+ unsigned interpret()
+ {
+ if (!input.isAvailableInput(0))
+ return offsetNoMatch;
+
+ for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
+ output[i << 1] = offsetNoMatch;
+
+ allocatorPool = pattern->m_allocator->startAllocator();
+ if (!allocatorPool)
+ CRASH();
+
+ DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
+
+ JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
+ if (result == JSRegExpMatch) {
+ output[0] = context->matchBegin;
+ output[1] = context->matchEnd;
+ }
+
+ freeDisjunctionContext(context);
+
+ pattern->m_allocator->stopAllocator();
+
+ ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
+ return output[0];
+ }
+
+ Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
+ : pattern(pattern)
+ , output(output)
+ , input(input, start, length)
+ , allocatorPool(0)
+ , remainingMatchCount(matchLimit)
+ {
+ }
+
+private:
+ BytecodePattern* pattern;
+ unsigned* output;
+ InputStream input;
+ BumpPointerPool* allocatorPool;
+ unsigned remainingMatchCount;
+};
+
+
+
+class ByteCompiler {
+ struct ParenthesesStackEntry {
+ unsigned beginTerm;
+ unsigned savedAlternativeIndex;
+ ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
+ : beginTerm(beginTerm)
+ , savedAlternativeIndex(savedAlternativeIndex)
+ {
+ }
+ };
+
+public:
+ ByteCompiler(YarrPattern& pattern)
+ : m_pattern(pattern)
+ {
+ m_currentAlternativeIndex = 0;
+ }
+
+ PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
+ {
+ regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
+ emitDisjunction(m_pattern.m_body);
+ regexEnd();
+
+ return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
+ }
+
+ void checkInput(unsigned count)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
+ }
+
+ void uncheckInput(unsigned count)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
+ }
+
+ void assertionBOL(unsigned inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
+ }
+
+ void assertionEOL(unsigned inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
+ }
+
+ void assertionWordBoundary(bool invert, unsigned inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
+ }
+
+ void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ if (m_pattern.m_ignoreCase) {
+ UChar lo = Unicode::toLower(ch);
+ UChar hi = Unicode::toUpper(ch);
+
+ if (lo != hi) {
+ m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
+ return;
+ }
+ }
+
+ m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
+ }
+
+ void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
+
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ }
+
+ void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ ASSERT(subpatternId);
+
+ m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
+
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
+ // then fix this up at the end! - simplifying this should make it much clearer.
+ // https://bugs.webkit.org/show_bug.cgi?id=50136
+
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
+
+ bool invert = m_bodyDisjunction->terms[beginTerm].invert();
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
+
+ void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
+ }
+
+ unsigned popParenthesesStack()
+ {
+ ASSERT(m_parenthesesStack.size());
+ int stackEnd = m_parenthesesStack.size() - 1;
+ unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
+ m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
+ m_parenthesesStack.shrink(stackEnd);
+
+ ASSERT(beginTerm < m_bodyDisjunction->terms.size());
+ ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
+
+ return beginTerm;
+ }
+
+#ifndef NDEBUG
+ void dumpDisjunction(ByteDisjunction* disjunction)
+ {
+ dataLogF("ByteDisjunction(%p):\n\t", disjunction);
+ for (unsigned i = 0; i < disjunction->terms.size(); ++i)
+ dataLogF("{ %d } ", disjunction->terms[i].type);
+ dataLogF("\n");
+ }
+#endif
+
+ void closeAlternative(int beginTerm)
+ {
+ int origBeginTerm = beginTerm;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
+ int endIndex = m_bodyDisjunction->terms.size();
+
+ unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+
+ if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
+ m_bodyDisjunction->terms.remove(beginTerm);
+ else {
+ while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+ beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
+ m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
+ m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+ }
+ }
+
+ void closeBodyAlternative()
+ {
+ int beginTerm = 0;
+ int origBeginTerm = 0;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
+ int endIndex = m_bodyDisjunction->terms.size();
+
+ unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+
+ while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+ beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
+ m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
+ m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
+
+ bool capture = parenthesesBegin.capture();
+ unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+
+ unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
+ ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
+ for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+ parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+
+ m_bodyDisjunction->terms.shrink(beginTerm);
+
+ m_allParenthesesInfo.append(parenthesesDisjunction);
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ bool capture = m_bodyDisjunction->terms[beginTerm].capture();
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
+
+ void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
+
+ bool capture = m_bodyDisjunction->terms[beginTerm].capture();
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ }
+
+ void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
+ {
+ m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
+ m_bodyDisjunction->terms[0].frameLocation = 0;
+ m_currentAlternativeIndex = 0;
+ }
+
+ void regexEnd()
+ {
+ closeBodyAlternative();
+ }
+
+ void alternativeBodyDisjunction(bool onceThrough)
+ {
+ int newAlternativeIndex = m_bodyDisjunction->terms.size();
+ m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
+
+ m_currentAlternativeIndex = newAlternativeIndex;
+ }
+
+ void alternativeDisjunction()
+ {
+ int newAlternativeIndex = m_bodyDisjunction->terms.size();
+ m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
+
+ m_currentAlternativeIndex = newAlternativeIndex;
+ }
+
+ void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
+ {
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
+
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+
+ if (alt) {
+ if (disjunction == m_pattern.m_body)
+ alternativeBodyDisjunction(alternative->onceThrough());
+ else
+ alternativeDisjunction();
+ }
+
+ unsigned minimumSize = alternative->m_minimumSize;
+ ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
+ unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
+
+ if (countToCheck) {
+ checkInput(countToCheck);
+ currentCountAlreadyChecked += countToCheck;
+ }
+
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+ PatternTerm& term = alternative->m_terms[i];
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ assertionBOL(currentCountAlreadyChecked - term.inputPosition);
+ break;
+
+ case PatternTerm::TypeAssertionEOL:
+ assertionEOL(currentCountAlreadyChecked - term.inputPosition);
+ break;
+
+ case PatternTerm::TypeAssertionWordBoundary:
+ assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
+ break;
+
+ case PatternTerm::TypePatternCharacter:
+ atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeBackReference:
+ atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern: {
+ unsigned disjunctionAlreadyCheckedCount = 0;
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ unsigned alternativeFrameLocation = term.frameLocation;
+ // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
+ if (term.quantityType == QuantifierFixedCount)
+ disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
+ else
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+ } else if (term.parentheses.isTerminal) {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
+ atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
+ } else {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
+ atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ }
+ break;
+ }
+
+ case PatternTerm::TypeParentheticalAssertion: {
+ unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
+
+ ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
+ unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
+ unsigned uncheckAmount = 0;
+ if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
+ uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
+ uncheckInput(uncheckAmount);
+ currentCountAlreadyChecked -= uncheckAmount;
+ }
+
+ atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
+ atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
+ if (uncheckAmount) {
+ checkInput(uncheckAmount);
+ currentCountAlreadyChecked += uncheckAmount;
+ }
+ break;
+ }
+
+ case PatternTerm::TypeDotStarEnclosure:
+ assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
+ break;
+ }
+ }
+ }
+ }
+
+private:
+ YarrPattern& m_pattern;
+ OwnPtr<ByteDisjunction> m_bodyDisjunction;
+ unsigned m_currentAlternativeIndex;
+ Vector<ParenthesesStackEntry> m_parenthesesStack;
+ Vector<ByteDisjunction*> m_allParenthesesInfo;
+};
+
+PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
+{
+ return ByteCompiler(pattern).compile(allocator);
+}
+
+unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
+{
+ if (input.is8Bit())
+ return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
+ return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
+}
+
+unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
+{
+ return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
+}
+
+unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
+{
+ return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
+}
+
+// These should be the same for both UChar & LChar.
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
+
+
+} }
diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.h b/src/3rdparty/masm/yarr/YarrInterpreter.h
new file mode 100644
index 0000000000..fb60bd979d
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrInterpreter.h
@@ -0,0 +1,385 @@
+/*
+ * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrInterpreter_h
+#define YarrInterpreter_h
+
+#include "YarrPattern.h"
+#include <wtf/PassOwnPtr.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace WTF {
+class BumpPointerAllocator;
+}
+using WTF::BumpPointerAllocator;
+
+namespace JSC { namespace Yarr {
+
+class ByteDisjunction;
+
+struct ByteTerm {
+ enum Type {
+ TypeBodyAlternativeBegin,
+ TypeBodyAlternativeDisjunction,
+ TypeBodyAlternativeEnd,
+ TypeAlternativeBegin,
+ TypeAlternativeDisjunction,
+ TypeAlternativeEnd,
+ TypeSubpatternBegin,
+ TypeSubpatternEnd,
+ TypeAssertionBOL,
+ TypeAssertionEOL,
+ TypeAssertionWordBoundary,
+ TypePatternCharacterOnce,
+ TypePatternCharacterFixed,
+ TypePatternCharacterGreedy,
+ TypePatternCharacterNonGreedy,
+ TypePatternCasedCharacterOnce,
+ TypePatternCasedCharacterFixed,
+ TypePatternCasedCharacterGreedy,
+ TypePatternCasedCharacterNonGreedy,
+ TypeCharacterClass,
+ TypeBackReference,
+ TypeParenthesesSubpattern,
+ TypeParenthesesSubpatternOnceBegin,
+ TypeParenthesesSubpatternOnceEnd,
+ TypeParenthesesSubpatternTerminalBegin,
+ TypeParenthesesSubpatternTerminalEnd,
+ TypeParentheticalAssertionBegin,
+ TypeParentheticalAssertionEnd,
+ TypeCheckInput,
+ TypeUncheckInput,
+ TypeDotStarEnclosure,
+ } type;
+ union {
+ struct {
+ union {
+ UChar patternCharacter;
+ struct {
+ UChar lo;
+ UChar hi;
+ } casedCharacter;
+ CharacterClass* characterClass;
+ unsigned subpatternId;
+ };
+ union {
+ ByteDisjunction* parenthesesDisjunction;
+ unsigned parenthesesWidth;
+ };
+ QuantifierType quantityType;
+ unsigned quantityCount;
+ } atom;
+ struct {
+ int next;
+ int end;
+ bool onceThrough;
+ } alternative;
+ struct {
+ bool m_bol : 1;
+ bool m_eol : 1;
+ } anchors;
+ unsigned checkInputCount;
+ };
+ unsigned frameLocation;
+ bool m_capture : 1;
+ bool m_invert : 1;
+ unsigned inputPosition;
+
+ ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ : frameLocation(frameLocation)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ switch (quantityType) {
+ case QuantifierFixedCount:
+ type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
+ break;
+ case QuantifierGreedy:
+ type = ByteTerm::TypePatternCharacterGreedy;
+ break;
+ case QuantifierNonGreedy:
+ type = ByteTerm::TypePatternCharacterNonGreedy;
+ break;
+ }
+
+ atom.patternCharacter = ch;
+ atom.quantityType = quantityType;
+ atom.quantityCount = quantityCount.unsafeGet();
+ inputPosition = inputPos;
+ }
+
+ ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ : frameLocation(frameLocation)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ switch (quantityType) {
+ case QuantifierFixedCount:
+ type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
+ break;
+ case QuantifierGreedy:
+ type = ByteTerm::TypePatternCasedCharacterGreedy;
+ break;
+ case QuantifierNonGreedy:
+ type = ByteTerm::TypePatternCasedCharacterNonGreedy;
+ break;
+ }
+
+ atom.casedCharacter.lo = lo;
+ atom.casedCharacter.hi = hi;
+ atom.quantityType = quantityType;
+ atom.quantityCount = quantityCount.unsafeGet();
+ inputPosition = inputPos;
+ }
+
+ ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
+ : type(ByteTerm::TypeCharacterClass)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ atom.characterClass = characterClass;
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ inputPosition = inputPos;
+ }
+
+ ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos)
+ : type(type)
+ , m_capture(capture)
+ , m_invert(false)
+ {
+ atom.subpatternId = subpatternId;
+ atom.parenthesesDisjunction = parenthesesInfo;
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ inputPosition = inputPos;
+ }
+
+ ByteTerm(Type type, bool invert = false)
+ : type(type)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ }
+
+ ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos)
+ : type(type)
+ , m_capture(capture)
+ , m_invert(invert)
+ {
+ atom.subpatternId = subpatternId;
+ atom.quantityType = QuantifierFixedCount;
+ atom.quantityCount = 1;
+ inputPosition = inputPos;
+ }
+
+ static ByteTerm BOL(int inputPos)
+ {
+ ByteTerm term(TypeAssertionBOL);
+ term.inputPosition = inputPos;
+ return term;
+ }
+
+ static ByteTerm CheckInput(Checked<unsigned> count)
+ {
+ ByteTerm term(TypeCheckInput);
+ term.checkInputCount = count.unsafeGet();
+ return term;
+ }
+
+ static ByteTerm UncheckInput(Checked<unsigned> count)
+ {
+ ByteTerm term(TypeUncheckInput);
+ term.checkInputCount = count.unsafeGet();
+ return term;
+ }
+
+ static ByteTerm EOL(int inputPos)
+ {
+ ByteTerm term(TypeAssertionEOL);
+ term.inputPosition = inputPos;
+ return term;
+ }
+
+ static ByteTerm WordBoundary(bool invert, int inputPos)
+ {
+ ByteTerm term(TypeAssertionWordBoundary, invert);
+ term.inputPosition = inputPos;
+ return term;
+ }
+
+ static ByteTerm BackReference(unsigned subpatternId, int inputPos)
+ {
+ return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
+ }
+
+ static ByteTerm BodyAlternativeBegin(bool onceThrough)
+ {
+ ByteTerm term(TypeBodyAlternativeBegin);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = onceThrough;
+ return term;
+ }
+
+ static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
+ {
+ ByteTerm term(TypeBodyAlternativeDisjunction);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = onceThrough;
+ return term;
+ }
+
+ static ByteTerm BodyAlternativeEnd()
+ {
+ ByteTerm term(TypeBodyAlternativeEnd);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm AlternativeBegin()
+ {
+ ByteTerm term(TypeAlternativeBegin);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm AlternativeDisjunction()
+ {
+ ByteTerm term(TypeAlternativeDisjunction);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm AlternativeEnd()
+ {
+ ByteTerm term(TypeAlternativeEnd);
+ term.alternative.next = 0;
+ term.alternative.end = 0;
+ term.alternative.onceThrough = false;
+ return term;
+ }
+
+ static ByteTerm SubpatternBegin()
+ {
+ return ByteTerm(TypeSubpatternBegin);
+ }
+
+ static ByteTerm SubpatternEnd()
+ {
+ return ByteTerm(TypeSubpatternEnd);
+ }
+
+ static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
+ {
+ ByteTerm term(TypeDotStarEnclosure);
+ term.anchors.m_bol = bolAnchor;
+ term.anchors.m_eol = eolAnchor;
+ return term;
+ }
+
+ bool invert()
+ {
+ return m_invert;
+ }
+
+ bool capture()
+ {
+ return m_capture;
+ }
+};
+
+class ByteDisjunction {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
+ : m_numSubpatterns(numSubpatterns)
+ , m_frameSize(frameSize)
+ {
+ }
+
+ Vector<ByteTerm> terms;
+ unsigned m_numSubpatterns;
+ unsigned m_frameSize;
+};
+
+struct BytecodePattern {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator)
+ : m_body(body)
+ , m_ignoreCase(pattern.m_ignoreCase)
+ , m_multiline(pattern.m_multiline)
+ , m_allocator(allocator)
+ {
+ newlineCharacterClass = pattern.newlineCharacterClass();
+ wordcharCharacterClass = pattern.wordcharCharacterClass();
+
+ m_allParenthesesInfo.append(allParenthesesInfo);
+ m_userCharacterClasses.append(pattern.m_userCharacterClasses);
+ // 'Steal' the YarrPattern's CharacterClasses! We clear its
+ // array, so that it won't delete them on destruction. We'll
+ // take responsibility for that.
+ pattern.m_userCharacterClasses.clear();
+ }
+
+ ~BytecodePattern()
+ {
+ deleteAllValues(m_allParenthesesInfo);
+ deleteAllValues(m_userCharacterClasses);
+ }
+
+ OwnPtr<ByteDisjunction> m_body;
+ bool m_ignoreCase;
+ bool m_multiline;
+ // Each BytecodePattern is associated with a RegExp, each RegExp is associated
+ // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator.
+ BumpPointerAllocator* m_allocator;
+
+ CharacterClass* newlineCharacterClass;
+ CharacterClass* wordcharCharacterClass;
+
+private:
+ Vector<ByteDisjunction*> m_allParenthesesInfo;
+ Vector<CharacterClass*> m_userCharacterClasses;
+};
+
+JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*);
+JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
+unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
+
+} } // namespace JSC::Yarr
+
+#endif // YarrInterpreter_h
diff --git a/src/3rdparty/masm/yarr/YarrJIT.cpp b/src/3rdparty/masm/yarr/YarrJIT.cpp
new file mode 100644
index 0000000000..ce84e2c74f
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrJIT.cpp
@@ -0,0 +1,2667 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrJIT.h"
+
+#include <wtf/ASCIICType.h>
+#include "LinkBuffer.h"
+#include "Options.h"
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+
+#if ENABLE(YARR_JIT)
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+template<YarrJITCompileMode compileMode>
+class YarrGenerator : private MacroAssembler {
+ friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline);
+
+#if CPU(ARM)
+ static const RegisterID input = ARMRegisters::r0;
+ static const RegisterID index = ARMRegisters::r1;
+ static const RegisterID length = ARMRegisters::r2;
+ static const RegisterID output = ARMRegisters::r4;
+
+ static const RegisterID regT0 = ARMRegisters::r5;
+ static const RegisterID regT1 = ARMRegisters::r6;
+
+ static const RegisterID returnRegister = ARMRegisters::r0;
+ static const RegisterID returnRegister2 = ARMRegisters::r1;
+#elif CPU(MIPS)
+ static const RegisterID input = MIPSRegisters::a0;
+ static const RegisterID index = MIPSRegisters::a1;
+ static const RegisterID length = MIPSRegisters::a2;
+ static const RegisterID output = MIPSRegisters::a3;
+
+ static const RegisterID regT0 = MIPSRegisters::t4;
+ static const RegisterID regT1 = MIPSRegisters::t5;
+
+ static const RegisterID returnRegister = MIPSRegisters::v0;
+ static const RegisterID returnRegister2 = MIPSRegisters::v1;
+#elif CPU(SH4)
+ static const RegisterID input = SH4Registers::r4;
+ static const RegisterID index = SH4Registers::r5;
+ static const RegisterID length = SH4Registers::r6;
+ static const RegisterID output = SH4Registers::r7;
+
+ static const RegisterID regT0 = SH4Registers::r0;
+ static const RegisterID regT1 = SH4Registers::r1;
+
+ static const RegisterID returnRegister = SH4Registers::r0;
+ static const RegisterID returnRegister2 = SH4Registers::r1;
+#elif CPU(X86)
+ static const RegisterID input = X86Registers::eax;
+ static const RegisterID index = X86Registers::edx;
+ static const RegisterID length = X86Registers::ecx;
+ static const RegisterID output = X86Registers::edi;
+
+ static const RegisterID regT0 = X86Registers::ebx;
+ static const RegisterID regT1 = X86Registers::esi;
+
+ static const RegisterID returnRegister = X86Registers::eax;
+ static const RegisterID returnRegister2 = X86Registers::edx;
+#elif CPU(X86_64)
+ static const RegisterID input = X86Registers::edi;
+ static const RegisterID index = X86Registers::esi;
+ static const RegisterID length = X86Registers::edx;
+ static const RegisterID output = X86Registers::ecx;
+
+ static const RegisterID regT0 = X86Registers::eax;
+ static const RegisterID regT1 = X86Registers::ebx;
+
+ static const RegisterID returnRegister = X86Registers::eax;
+ static const RegisterID returnRegister2 = X86Registers::edx;
+#endif
+
+ void optimizeAlternative(PatternAlternative* alternative)
+ {
+ if (!alternative->m_terms.size())
+ return;
+
+ for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) {
+ PatternTerm& term = alternative->m_terms[i];
+ PatternTerm& nextTerm = alternative->m_terms[i + 1];
+
+ if ((term.type == PatternTerm::TypeCharacterClass)
+ && (term.quantityType == QuantifierFixedCount)
+ && (nextTerm.type == PatternTerm::TypePatternCharacter)
+ && (nextTerm.quantityType == QuantifierFixedCount)) {
+ PatternTerm termCopy = term;
+ alternative->m_terms[i] = nextTerm;
+ alternative->m_terms[i + 1] = termCopy;
+ }
+ }
+ }
+
+ void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount)
+ {
+ do {
+ // pick which range we're going to generate
+ int which = count >> 1;
+ char lo = ranges[which].begin;
+ char hi = ranges[which].end;
+
+ // check if there are any ranges or matches below lo. If not, just jl to failure -
+ // if there is anything else to check, check that first, if it falls through jmp to failure.
+ if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) {
+ Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo));
+
+ // generate code for all ranges before this one
+ if (which)
+ matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount);
+
+ while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) {
+ matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex])));
+ ++*matchIndex;
+ }
+ failures.append(jump());
+
+ loOrAbove.link(this);
+ } else if (which) {
+ Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo));
+
+ matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount);
+ failures.append(jump());
+
+ loOrAbove.link(this);
+ } else
+ failures.append(branch32(LessThan, character, Imm32((unsigned short)lo)));
+
+ while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi))
+ ++*matchIndex;
+
+ matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi)));
+ // fall through to here, the value is above hi.
+
+ // shuffle along & loop around if there are any more matches to handle.
+ unsigned next = which + 1;
+ ranges += next;
+ count -= next;
+ } while (count);
+ }
+
+ void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass)
+ {
+ if (charClass->m_table) {
+ ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table));
+ matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry));
+ return;
+ }
+ Jump unicodeFail;
+ if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) {
+ Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f));
+
+ if (charClass->m_matchesUnicode.size()) {
+ for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) {
+ UChar ch = charClass->m_matchesUnicode[i];
+ matchDest.append(branch32(Equal, character, Imm32(ch)));
+ }
+ }
+
+ if (charClass->m_rangesUnicode.size()) {
+ for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) {
+ UChar lo = charClass->m_rangesUnicode[i].begin;
+ UChar hi = charClass->m_rangesUnicode[i].end;
+
+ Jump below = branch32(LessThan, character, Imm32(lo));
+ matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi)));
+ below.link(this);
+ }
+ }
+
+ unicodeFail = jump();
+ isAscii.link(this);
+ }
+
+ if (charClass->m_ranges.size()) {
+ unsigned matchIndex = 0;
+ JumpList failures;
+ matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size());
+ while (matchIndex < charClass->m_matches.size())
+ matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++])));
+
+ failures.link(this);
+ } else if (charClass->m_matches.size()) {
+ // optimization: gather 'a','A' etc back together, can mask & test once.
+ Vector<char> matchesAZaz;
+
+ for (unsigned i = 0; i < charClass->m_matches.size(); ++i) {
+ char ch = charClass->m_matches[i];
+ if (m_pattern.m_ignoreCase) {
+ if (isASCIILower(ch)) {
+ matchesAZaz.append(ch);
+ continue;
+ }
+ if (isASCIIUpper(ch))
+ continue;
+ }
+ matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch)));
+ }
+
+ if (unsigned countAZaz = matchesAZaz.size()) {
+ or32(TrustedImm32(32), character);
+ for (unsigned i = 0; i < countAZaz; ++i)
+ matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i])));
+ }
+ }
+
+ if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size())
+ unicodeFail.link(this);
+ }
+
+ // Jumps if input not available; will have (incorrectly) incremented already!
+ Jump jumpIfNoAvailableInput(unsigned countToCheck = 0)
+ {
+ if (countToCheck)
+ add32(Imm32(countToCheck), index);
+ return branch32(Above, index, length);
+ }
+
+ Jump jumpIfAvailableInput(unsigned countToCheck)
+ {
+ add32(Imm32(countToCheck), index);
+ return branch32(BelowOrEqual, index, length);
+ }
+
+ Jump checkInput()
+ {
+ return branch32(BelowOrEqual, index, length);
+ }
+
+ Jump atEndOfInput()
+ {
+ return branch32(Equal, index, length);
+ }
+
+ Jump notAtEndOfInput()
+ {
+ return branch32(NotEqual, index, length);
+ }
+
+ Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character)
+ {
+ readCharacter(inputPosition, character);
+
+ // For case-insesitive compares, non-ascii characters that have different
+ // upper & lower case representations are converted to a character class.
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
+ or32(TrustedImm32(0x20), character);
+ ch |= 0x20;
+ }
+
+ return branch32(NotEqual, character, Imm32(ch));
+ }
+
+ void readCharacter(int inputPosition, RegisterID reg)
+ {
+ if (m_charSize == Char8)
+ load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg);
+ else
+ load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg);
+ }
+
+ void storeToFrame(RegisterID reg, unsigned frameLocation)
+ {
+ poke(reg, frameLocation);
+ }
+
+ void storeToFrame(TrustedImm32 imm, unsigned frameLocation)
+ {
+ poke(imm, frameLocation);
+ }
+
+ DataLabelPtr storeToFrameWithPatch(unsigned frameLocation)
+ {
+ return storePtrWithPatch(TrustedImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*)));
+ }
+
+ void loadFromFrame(unsigned frameLocation, RegisterID reg)
+ {
+ peek(reg, frameLocation);
+ }
+
+ void loadFromFrameAndJump(unsigned frameLocation)
+ {
+ jump(Address(stackPointerRegister, frameLocation * sizeof(void*)));
+ }
+
+ void initCallFrame()
+ {
+ unsigned callFrameSize = m_pattern.m_body->m_callFrameSize;
+ if (callFrameSize)
+ subPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister);
+ }
+ void removeCallFrame()
+ {
+ unsigned callFrameSize = m_pattern.m_body->m_callFrameSize;
+ if (callFrameSize)
+ addPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister);
+ }
+
+ // Used to record subpatters, should only be called if compileMode is IncludeSubpatterns.
+ void setSubpatternStart(RegisterID reg, unsigned subpattern)
+ {
+ ASSERT(subpattern);
+ // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+ store32(reg, Address(output, (subpattern << 1) * sizeof(int)));
+ }
+ void setSubpatternEnd(RegisterID reg, unsigned subpattern)
+ {
+ ASSERT(subpattern);
+ // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+ store32(reg, Address(output, ((subpattern << 1) + 1) * sizeof(int)));
+ }
+ void clearSubpatternStart(unsigned subpattern)
+ {
+ ASSERT(subpattern);
+ // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-(
+ store32(TrustedImm32(-1), Address(output, (subpattern << 1) * sizeof(int)));
+ }
+
+ // We use one of three different strategies to track the start of the current match,
+ // while matching.
+ // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily
+ // at the end of matching. This is irrespective of compileMode, and in this case
+ // these methods should never be called.
+ // 2) If we're compiling IncludeSubpatterns, 'output' contains a pointer to an output
+ // vector, store the match start in the output vector.
+ // 3) If we're compiling MatchOnly, 'output' is unused, store the match start directly
+ // in this register.
+ void setMatchStart(RegisterID reg)
+ {
+ ASSERT(!m_pattern.m_body->m_hasFixedSize);
+ if (compileMode == IncludeSubpatterns)
+ store32(reg, output);
+ else
+ move(reg, output);
+ }
+ void getMatchStart(RegisterID reg)
+ {
+ ASSERT(!m_pattern.m_body->m_hasFixedSize);
+ if (compileMode == IncludeSubpatterns)
+ load32(output, reg);
+ else
+ move(output, reg);
+ }
+
+ enum YarrOpCode {
+ // These nodes wrap body alternatives - those in the main disjunction,
+ // rather than subpatterns or assertions. These are chained together in
+ // a doubly linked list, with a 'begin' node for the first alternative,
+ // a 'next' node for each subsequent alternative, and an 'end' node at
+ // the end. In the case of repeating alternatives, the 'end' node also
+ // has a reference back to 'begin'.
+ OpBodyAlternativeBegin,
+ OpBodyAlternativeNext,
+ OpBodyAlternativeEnd,
+ // Similar to the body alternatives, but used for subpatterns with two
+ // or more alternatives.
+ OpNestedAlternativeBegin,
+ OpNestedAlternativeNext,
+ OpNestedAlternativeEnd,
+ // Used for alternatives in subpatterns where there is only a single
+ // alternative (backtrackingis easier in these cases), or for alternatives
+ // which never need to be backtracked (those in parenthetical assertions,
+ // terminal subpatterns).
+ OpSimpleNestedAlternativeBegin,
+ OpSimpleNestedAlternativeNext,
+ OpSimpleNestedAlternativeEnd,
+ // Used to wrap 'Once' subpattern matches (quantityCount == 1).
+ OpParenthesesSubpatternOnceBegin,
+ OpParenthesesSubpatternOnceEnd,
+ // Used to wrap 'Terminal' subpattern matches (at the end of the regexp).
+ OpParenthesesSubpatternTerminalBegin,
+ OpParenthesesSubpatternTerminalEnd,
+ // Used to wrap parenthetical assertions.
+ OpParentheticalAssertionBegin,
+ OpParentheticalAssertionEnd,
+ // Wraps all simple terms (pattern characters, character classes).
+ OpTerm,
+ // Where an expression contains only 'once through' body alternatives
+ // and no repeating ones, this op is used to return match failure.
+ OpMatchFailed
+ };
+
+ // This structure is used to hold the compiled opcode information,
+ // including reference back to the original PatternTerm/PatternAlternatives,
+ // and JIT compilation data structures.
+ struct YarrOp {
+ explicit YarrOp(PatternTerm* term)
+ : m_op(OpTerm)
+ , m_term(term)
+ , m_isDeadCode(false)
+ {
+ }
+
+ explicit YarrOp(YarrOpCode op)
+ : m_op(op)
+ , m_isDeadCode(false)
+ {
+ }
+
+ // The operation, as a YarrOpCode, and also a reference to the PatternTerm.
+ YarrOpCode m_op;
+ PatternTerm* m_term;
+
+ // For alternatives, this holds the PatternAlternative and doubly linked
+ // references to this alternative's siblings. In the case of the
+ // OpBodyAlternativeEnd node at the end of a section of repeating nodes,
+ // m_nextOp will reference the OpBodyAlternativeBegin node of the first
+ // repeating alternative.
+ PatternAlternative* m_alternative;
+ size_t m_previousOp;
+ size_t m_nextOp;
+
+ // Used to record a set of Jumps out of the generated code, typically
+ // used for jumps out to backtracking code, and a single reentry back
+ // into the code for a node (likely where a backtrack will trigger
+ // rematching).
+ Label m_reentry;
+ JumpList m_jumps;
+
+ // Used for backtracking when the prior alternative did not consume any
+ // characters but matched.
+ Jump m_zeroLengthMatch;
+
+ // This flag is used to null out the second pattern character, when
+ // two are fused to match a pair together.
+ bool m_isDeadCode;
+
+ // Currently used in the case of some of the more complex management of
+ // 'm_checked', to cache the offset used in this alternative, to avoid
+ // recalculating it.
+ int m_checkAdjust;
+
+ // Used by OpNestedAlternativeNext/End to hold the pointer to the
+ // value that will be pushed into the pattern's frame to return to,
+ // upon backtracking back into the disjunction.
+ DataLabelPtr m_returnAddress;
+ };
+
+ // BacktrackingState
+ // This class encapsulates information about the state of code generation
+ // whilst generating the code for backtracking, when a term fails to match.
+ // Upon entry to code generation of the backtracking code for a given node,
+ // the Backtracking state will hold references to all control flow sources
+ // that are outputs in need of further backtracking from the prior node
+ // generated (which is the subsequent operation in the regular expression,
+ // and in the m_ops Vector, since we generated backtracking backwards).
+ // These references to control flow take the form of:
+ // - A jump list of jumps, to be linked to code that will backtrack them
+ // further.
+ // - A set of DataLabelPtr values, to be populated with values to be
+ // treated effectively as return addresses backtracking into complex
+ // subpatterns.
+ // - A flag indicating that the current sequence of generated code up to
+ // this point requires backtracking.
+ class BacktrackingState {
+ public:
+ BacktrackingState()
+ : m_pendingFallthrough(false)
+ {
+ }
+
+ // Add a jump or jumps, a return address, or set the flag indicating
+ // that the current 'fallthrough' control flow requires backtracking.
+ void append(const Jump& jump)
+ {
+ m_laterFailures.append(jump);
+ }
+ void append(JumpList& jumpList)
+ {
+ m_laterFailures.append(jumpList);
+ }
+ void append(const DataLabelPtr& returnAddress)
+ {
+ m_pendingReturns.append(returnAddress);
+ }
+ void fallthrough()
+ {
+ ASSERT(!m_pendingFallthrough);
+ m_pendingFallthrough = true;
+ }
+
+ // These methods clear the backtracking state, either linking to the
+ // current location, a provided label, or copying the backtracking out
+ // to a JumpList. All actions may require code generation to take place,
+ // and as such are passed a pointer to the assembler.
+ void link(MacroAssembler* assembler)
+ {
+ if (m_pendingReturns.size()) {
+ Label here(assembler);
+ for (unsigned i = 0; i < m_pendingReturns.size(); ++i)
+ m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here));
+ m_pendingReturns.clear();
+ }
+ m_laterFailures.link(assembler);
+ m_laterFailures.clear();
+ m_pendingFallthrough = false;
+ }
+ void linkTo(Label label, MacroAssembler* assembler)
+ {
+ if (m_pendingReturns.size()) {
+ for (unsigned i = 0; i < m_pendingReturns.size(); ++i)
+ m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], label));
+ m_pendingReturns.clear();
+ }
+ if (m_pendingFallthrough)
+ assembler->jump(label);
+ m_laterFailures.linkTo(label, assembler);
+ m_laterFailures.clear();
+ m_pendingFallthrough = false;
+ }
+ void takeBacktracksToJumpList(JumpList& jumpList, MacroAssembler* assembler)
+ {
+ if (m_pendingReturns.size()) {
+ Label here(assembler);
+ for (unsigned i = 0; i < m_pendingReturns.size(); ++i)
+ m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here));
+ m_pendingReturns.clear();
+ m_pendingFallthrough = true;
+ }
+ if (m_pendingFallthrough)
+ jumpList.append(assembler->jump());
+ jumpList.append(m_laterFailures);
+ m_laterFailures.clear();
+ m_pendingFallthrough = false;
+ }
+
+ bool isEmpty()
+ {
+ return m_laterFailures.empty() && m_pendingReturns.isEmpty() && !m_pendingFallthrough;
+ }
+
+ // Called at the end of code generation to link all return addresses.
+ void linkDataLabels(LinkBuffer& linkBuffer)
+ {
+ ASSERT(isEmpty());
+ for (unsigned i = 0; i < m_backtrackRecords.size(); ++i)
+ linkBuffer.patch(m_backtrackRecords[i].m_dataLabel, linkBuffer.locationOf(m_backtrackRecords[i].m_backtrackLocation));
+ }
+
+ private:
+ struct ReturnAddressRecord {
+ ReturnAddressRecord(DataLabelPtr dataLabel, Label backtrackLocation)
+ : m_dataLabel(dataLabel)
+ , m_backtrackLocation(backtrackLocation)
+ {
+ }
+
+ DataLabelPtr m_dataLabel;
+ Label m_backtrackLocation;
+ };
+
+ JumpList m_laterFailures;
+ bool m_pendingFallthrough;
+ Vector<DataLabelPtr, 4> m_pendingReturns;
+ Vector<ReturnAddressRecord, 4> m_backtrackRecords;
+ };
+
+ // Generation methods:
+ // ===================
+
+ // This method provides a default implementation of backtracking common
+ // to many terms; terms commonly jump out of the forwards matching path
+ // on any failed conditions, and add these jumps to the m_jumps list. If
+ // no special handling is required we can often just backtrack to m_jumps.
+ void backtrackTermDefault(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ m_backtrackingState.append(op.m_jumps);
+ }
+
+ void generateAssertionBOL(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ if (m_pattern.m_multiline) {
+ const RegisterID character = regT0;
+
+ JumpList matchDest;
+ if (!term->inputPosition)
+ matchDest.append(branch32(Equal, index, Imm32(m_checked)));
+
+ readCharacter((term->inputPosition - m_checked) - 1, character);
+ matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass());
+ op.m_jumps.append(jump());
+
+ matchDest.link(this);
+ } else {
+ // Erk, really should poison out these alternatives early. :-/
+ if (term->inputPosition)
+ op.m_jumps.append(jump());
+ else
+ op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked)));
+ }
+ }
+ void backtrackAssertionBOL(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ void generateAssertionEOL(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ if (m_pattern.m_multiline) {
+ const RegisterID character = regT0;
+
+ JumpList matchDest;
+ if (term->inputPosition == m_checked)
+ matchDest.append(atEndOfInput());
+
+ readCharacter(term->inputPosition - m_checked, character);
+ matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass());
+ op.m_jumps.append(jump());
+
+ matchDest.link(this);
+ } else {
+ if (term->inputPosition == m_checked)
+ op.m_jumps.append(notAtEndOfInput());
+ // Erk, really should poison out these alternatives early. :-/
+ else
+ op.m_jumps.append(jump());
+ }
+ }
+ void backtrackAssertionEOL(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ // Also falls though on nextIsNotWordChar.
+ void matchAssertionWordchar(size_t opIndex, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+
+ if (term->inputPosition == m_checked)
+ nextIsNotWordChar.append(atEndOfInput());
+
+ readCharacter((term->inputPosition - m_checked), character);
+ matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass());
+ }
+
+ void generateAssertionWordBoundary(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+
+ Jump atBegin;
+ JumpList matchDest;
+ if (!term->inputPosition)
+ atBegin = branch32(Equal, index, Imm32(m_checked));
+ readCharacter((term->inputPosition - m_checked) - 1, character);
+ matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass());
+ if (!term->inputPosition)
+ atBegin.link(this);
+
+ // We fall through to here if the last character was not a wordchar.
+ JumpList nonWordCharThenWordChar;
+ JumpList nonWordCharThenNonWordChar;
+ if (term->invert()) {
+ matchAssertionWordchar(opIndex, nonWordCharThenNonWordChar, nonWordCharThenWordChar);
+ nonWordCharThenWordChar.append(jump());
+ } else {
+ matchAssertionWordchar(opIndex, nonWordCharThenWordChar, nonWordCharThenNonWordChar);
+ nonWordCharThenNonWordChar.append(jump());
+ }
+ op.m_jumps.append(nonWordCharThenNonWordChar);
+
+ // We jump here if the last character was a wordchar.
+ matchDest.link(this);
+ JumpList wordCharThenWordChar;
+ JumpList wordCharThenNonWordChar;
+ if (term->invert()) {
+ matchAssertionWordchar(opIndex, wordCharThenNonWordChar, wordCharThenWordChar);
+ wordCharThenWordChar.append(jump());
+ } else {
+ matchAssertionWordchar(opIndex, wordCharThenWordChar, wordCharThenNonWordChar);
+ // This can fall-though!
+ }
+
+ op.m_jumps.append(wordCharThenWordChar);
+
+ nonWordCharThenWordChar.link(this);
+ wordCharThenNonWordChar.link(this);
+ }
+ void backtrackAssertionWordBoundary(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ void generatePatternCharacterOnce(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+
+ if (op.m_isDeadCode)
+ return;
+
+ // m_ops always ends with a OpBodyAlternativeEnd or OpMatchFailed
+ // node, so there must always be at least one more node.
+ ASSERT(opIndex + 1 < m_ops.size());
+ YarrOp* nextOp = &m_ops[opIndex + 1];
+
+ PatternTerm* term = op.m_term;
+ UChar ch = term->patternCharacter;
+
+ if ((ch > 0xff) && (m_charSize == Char8)) {
+ // Have a 16 bit pattern character and an 8 bit string - short circuit
+ op.m_jumps.append(jump());
+ return;
+ }
+
+ const RegisterID character = regT0;
+ int maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2;
+ unsigned ignoreCaseMask = 0;
+ int allCharacters = ch;
+ int numberCharacters;
+ int startTermPosition = term->inputPosition;
+
+ // For case-insesitive compares, non-ascii characters that have different
+ // upper & lower case representations are converted to a character class.
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(ch))
+ ignoreCaseMask |= 32;
+
+ for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) {
+ PatternTerm* nextTerm = nextOp->m_term;
+
+ if (nextTerm->type != PatternTerm::TypePatternCharacter
+ || nextTerm->quantityType != QuantifierFixedCount
+ || nextTerm->quantityCount != 1
+ || nextTerm->inputPosition != (startTermPosition + numberCharacters))
+ break;
+
+ nextOp->m_isDeadCode = true;
+
+ int shiftAmount = (m_charSize == Char8 ? 8 : 16) * numberCharacters;
+
+ UChar currentCharacter = nextTerm->patternCharacter;
+
+ if ((currentCharacter > 0xff) && (m_charSize == Char8)) {
+ // Have a 16 bit pattern character and an 8 bit string - short circuit
+ op.m_jumps.append(jump());
+ return;
+ }
+
+ // For case-insesitive compares, non-ascii characters that have different
+ // upper & lower case representations are converted to a character class.
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter));
+
+ allCharacters |= (currentCharacter << shiftAmount);
+
+ if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(currentCharacter)))
+ ignoreCaseMask |= 32 << shiftAmount;
+ }
+
+ if (m_charSize == Char8) {
+ switch (numberCharacters) {
+ case 1:
+ op.m_jumps.append(jumpIfCharNotEquals(ch, startTermPosition - m_checked, character));
+ return;
+ case 2: {
+ BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
+ load16Unaligned(address, character);
+ break;
+ }
+ case 3: {
+ BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
+ load16Unaligned(highAddress, character);
+ if (ignoreCaseMask)
+ or32(Imm32(ignoreCaseMask), character);
+ op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask)));
+ op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, startTermPosition + 2 - m_checked, character));
+ return;
+ }
+ case 4: {
+ BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar));
+ load32WithUnalignedHalfWords(address, character);
+ break;
+ }
+ }
+ } else {
+ switch (numberCharacters) {
+ case 1:
+ op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+ return;
+ case 2:
+ BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar));
+ load32WithUnalignedHalfWords(address, character);
+ break;
+ }
+ }
+
+ if (ignoreCaseMask)
+ or32(Imm32(ignoreCaseMask), character);
+ op.m_jumps.append(branch32(NotEqual, character, Imm32(allCharacters | ignoreCaseMask)));
+ return;
+ }
+ void backtrackPatternCharacterOnce(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ void generatePatternCharacterFixed(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+ UChar ch = term->patternCharacter;
+
+ const RegisterID character = regT0;
+ const RegisterID countRegister = regT1;
+
+ move(index, countRegister);
+ sub32(Imm32(term->quantityCount.unsafeGet()), countRegister);
+
+ Label loop(this);
+ BaseIndex address(input, countRegister, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(m_charSize == Char8 ? sizeof(char) : sizeof(UChar))).unsafeGet());
+
+ if (m_charSize == Char8)
+ load8(address, character);
+ else
+ load16(address, character);
+
+ // For case-insesitive compares, non-ascii characters that have different
+ // upper & lower case representations are converted to a character class.
+ ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch));
+ if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) {
+ or32(TrustedImm32(0x20), character);
+ ch |= 0x20;
+ }
+
+ op.m_jumps.append(branch32(NotEqual, character, Imm32(ch)));
+ add32(TrustedImm32(1), countRegister);
+ branch32(NotEqual, countRegister, index).linkTo(loop, this);
+ }
+ void backtrackPatternCharacterFixed(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ void generatePatternCharacterGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+ UChar ch = term->patternCharacter;
+
+ const RegisterID character = regT0;
+ const RegisterID countRegister = regT1;
+
+ move(TrustedImm32(0), countRegister);
+
+ // Unless have a 16 bit pattern character and an 8 bit string - short circuit
+ if (!((ch > 0xff) && (m_charSize == Char8))) {
+ JumpList failures;
+ Label loop(this);
+ failures.append(atEndOfInput());
+ failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+
+ add32(TrustedImm32(1), countRegister);
+ add32(TrustedImm32(1), index);
+ if (term->quantityCount == quantifyInfinite)
+ jump(loop);
+ else
+ branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this);
+
+ failures.link(this);
+ }
+ op.m_reentry = label();
+
+ storeToFrame(countRegister, term->frameLocation);
+ }
+ void backtrackPatternCharacterGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID countRegister = regT1;
+
+ m_backtrackingState.link(this);
+
+ loadFromFrame(term->frameLocation, countRegister);
+ m_backtrackingState.append(branchTest32(Zero, countRegister));
+ sub32(TrustedImm32(1), countRegister);
+ sub32(TrustedImm32(1), index);
+ jump(op.m_reentry);
+ }
+
+ void generatePatternCharacterNonGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID countRegister = regT1;
+
+ move(TrustedImm32(0), countRegister);
+ op.m_reentry = label();
+ storeToFrame(countRegister, term->frameLocation);
+ }
+ void backtrackPatternCharacterNonGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+ UChar ch = term->patternCharacter;
+
+ const RegisterID character = regT0;
+ const RegisterID countRegister = regT1;
+
+ m_backtrackingState.link(this);
+
+ loadFromFrame(term->frameLocation, countRegister);
+
+ // Unless have a 16 bit pattern character and an 8 bit string - short circuit
+ if (!((ch > 0xff) && (m_charSize == Char8))) {
+ JumpList nonGreedyFailures;
+ nonGreedyFailures.append(atEndOfInput());
+ if (term->quantityCount != quantifyInfinite)
+ nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet())));
+ nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character));
+
+ add32(TrustedImm32(1), countRegister);
+ add32(TrustedImm32(1), index);
+
+ jump(op.m_reentry);
+ nonGreedyFailures.link(this);
+ }
+
+ sub32(countRegister, index);
+ m_backtrackingState.fallthrough();
+ }
+
+ void generateCharacterClassOnce(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+
+ JumpList matchDest;
+ readCharacter(term->inputPosition - m_checked, character);
+ matchCharacterClass(character, matchDest, term->characterClass);
+
+ if (term->invert())
+ op.m_jumps.append(matchDest);
+ else {
+ op.m_jumps.append(jump());
+ matchDest.link(this);
+ }
+ }
+ void backtrackCharacterClassOnce(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ void generateCharacterClassFixed(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+ const RegisterID countRegister = regT1;
+
+ move(index, countRegister);
+ sub32(Imm32(term->quantityCount.unsafeGet()), countRegister);
+
+ Label loop(this);
+ JumpList matchDest;
+ if (m_charSize == Char8)
+ load8(BaseIndex(input, countRegister, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character);
+ else
+ load16(BaseIndex(input, countRegister, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character);
+ matchCharacterClass(character, matchDest, term->characterClass);
+
+ if (term->invert())
+ op.m_jumps.append(matchDest);
+ else {
+ op.m_jumps.append(jump());
+ matchDest.link(this);
+ }
+
+ add32(TrustedImm32(1), countRegister);
+ branch32(NotEqual, countRegister, index).linkTo(loop, this);
+ }
+ void backtrackCharacterClassFixed(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ void generateCharacterClassGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+ const RegisterID countRegister = regT1;
+
+ move(TrustedImm32(0), countRegister);
+
+ JumpList failures;
+ Label loop(this);
+ failures.append(atEndOfInput());
+
+ if (term->invert()) {
+ readCharacter(term->inputPosition - m_checked, character);
+ matchCharacterClass(character, failures, term->characterClass);
+ } else {
+ JumpList matchDest;
+ readCharacter(term->inputPosition - m_checked, character);
+ matchCharacterClass(character, matchDest, term->characterClass);
+ failures.append(jump());
+ matchDest.link(this);
+ }
+
+ add32(TrustedImm32(1), countRegister);
+ add32(TrustedImm32(1), index);
+ if (term->quantityCount != quantifyInfinite) {
+ branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this);
+ failures.append(jump());
+ } else
+ jump(loop);
+
+ failures.link(this);
+ op.m_reentry = label();
+
+ storeToFrame(countRegister, term->frameLocation);
+ }
+ void backtrackCharacterClassGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID countRegister = regT1;
+
+ m_backtrackingState.link(this);
+
+ loadFromFrame(term->frameLocation, countRegister);
+ m_backtrackingState.append(branchTest32(Zero, countRegister));
+ sub32(TrustedImm32(1), countRegister);
+ sub32(TrustedImm32(1), index);
+ jump(op.m_reentry);
+ }
+
+ void generateCharacterClassNonGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID countRegister = regT1;
+
+ move(TrustedImm32(0), countRegister);
+ op.m_reentry = label();
+ storeToFrame(countRegister, term->frameLocation);
+ }
+ void backtrackCharacterClassNonGreedy(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+ const RegisterID countRegister = regT1;
+
+ JumpList nonGreedyFailures;
+
+ m_backtrackingState.link(this);
+
+ loadFromFrame(term->frameLocation, countRegister);
+
+ nonGreedyFailures.append(atEndOfInput());
+ nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet())));
+
+ JumpList matchDest;
+ readCharacter(term->inputPosition - m_checked, character);
+ matchCharacterClass(character, matchDest, term->characterClass);
+
+ if (term->invert())
+ nonGreedyFailures.append(matchDest);
+ else {
+ nonGreedyFailures.append(jump());
+ matchDest.link(this);
+ }
+
+ add32(TrustedImm32(1), countRegister);
+ add32(TrustedImm32(1), index);
+
+ jump(op.m_reentry);
+
+ nonGreedyFailures.link(this);
+ sub32(countRegister, index);
+ m_backtrackingState.fallthrough();
+ }
+
+ void generateDotStarEnclosure(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ const RegisterID character = regT0;
+ const RegisterID matchPos = regT1;
+
+ JumpList foundBeginningNewLine;
+ JumpList saveStartIndex;
+ JumpList foundEndingNewLine;
+
+ ASSERT(!m_pattern.m_body->m_hasFixedSize);
+ getMatchStart(matchPos);
+
+ saveStartIndex.append(branchTest32(Zero, matchPos));
+ Label findBOLLoop(this);
+ sub32(TrustedImm32(1), matchPos);
+ if (m_charSize == Char8)
+ load8(BaseIndex(input, matchPos, TimesOne, 0), character);
+ else
+ load16(BaseIndex(input, matchPos, TimesTwo, 0), character);
+ matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass());
+ branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this);
+ saveStartIndex.append(jump());
+
+ foundBeginningNewLine.link(this);
+ add32(TrustedImm32(1), matchPos); // Advance past newline
+ saveStartIndex.link(this);
+
+ if (!m_pattern.m_multiline && term->anchors.bolAnchor)
+ op.m_jumps.append(branchTest32(NonZero, matchPos));
+
+ ASSERT(!m_pattern.m_body->m_hasFixedSize);
+ setMatchStart(matchPos);
+
+ move(index, matchPos);
+
+ Label findEOLLoop(this);
+ foundEndingNewLine.append(branch32(Equal, matchPos, length));
+ if (m_charSize == Char8)
+ load8(BaseIndex(input, matchPos, TimesOne, 0), character);
+ else
+ load16(BaseIndex(input, matchPos, TimesTwo, 0), character);
+ matchCharacterClass(character, foundEndingNewLine, m_pattern.newlineCharacterClass());
+ add32(TrustedImm32(1), matchPos);
+ jump(findEOLLoop);
+
+ foundEndingNewLine.link(this);
+
+ if (!m_pattern.m_multiline && term->anchors.eolAnchor)
+ op.m_jumps.append(branch32(NotEqual, matchPos, length));
+
+ move(matchPos, index);
+ }
+
+ void backtrackDotStarEnclosure(size_t opIndex)
+ {
+ backtrackTermDefault(opIndex);
+ }
+
+ // Code generation/backtracking for simple terms
+ // (pattern characters, character classes, and assertions).
+ // These methods farm out work to the set of functions above.
+ void generateTerm(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ switch (term->type) {
+ case PatternTerm::TypePatternCharacter:
+ switch (term->quantityType) {
+ case QuantifierFixedCount:
+ if (term->quantityCount == 1)
+ generatePatternCharacterOnce(opIndex);
+ else
+ generatePatternCharacterFixed(opIndex);
+ break;
+ case QuantifierGreedy:
+ generatePatternCharacterGreedy(opIndex);
+ break;
+ case QuantifierNonGreedy:
+ generatePatternCharacterNonGreedy(opIndex);
+ break;
+ }
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ switch (term->quantityType) {
+ case QuantifierFixedCount:
+ if (term->quantityCount == 1)
+ generateCharacterClassOnce(opIndex);
+ else
+ generateCharacterClassFixed(opIndex);
+ break;
+ case QuantifierGreedy:
+ generateCharacterClassGreedy(opIndex);
+ break;
+ case QuantifierNonGreedy:
+ generateCharacterClassNonGreedy(opIndex);
+ break;
+ }
+ break;
+
+ case PatternTerm::TypeAssertionBOL:
+ generateAssertionBOL(opIndex);
+ break;
+
+ case PatternTerm::TypeAssertionEOL:
+ generateAssertionEOL(opIndex);
+ break;
+
+ case PatternTerm::TypeAssertionWordBoundary:
+ generateAssertionWordBoundary(opIndex);
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ case PatternTerm::TypeParentheticalAssertion:
+ ASSERT_NOT_REACHED();
+ case PatternTerm::TypeBackReference:
+ m_shouldFallBack = true;
+ break;
+ case PatternTerm::TypeDotStarEnclosure:
+ generateDotStarEnclosure(opIndex);
+ break;
+ }
+ }
+ void backtrackTerm(size_t opIndex)
+ {
+ YarrOp& op = m_ops[opIndex];
+ PatternTerm* term = op.m_term;
+
+ switch (term->type) {
+ case PatternTerm::TypePatternCharacter:
+ switch (term->quantityType) {
+ case QuantifierFixedCount:
+ if (term->quantityCount == 1)
+ backtrackPatternCharacterOnce(opIndex);
+ else
+ backtrackPatternCharacterFixed(opIndex);
+ break;
+ case QuantifierGreedy:
+ backtrackPatternCharacterGreedy(opIndex);
+ break;
+ case QuantifierNonGreedy:
+ backtrackPatternCharacterNonGreedy(opIndex);
+ break;
+ }
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ switch (term->quantityType) {
+ case QuantifierFixedCount:
+ if (term->quantityCount == 1)
+ backtrackCharacterClassOnce(opIndex);
+ else
+ backtrackCharacterClassFixed(opIndex);
+ break;
+ case QuantifierGreedy:
+ backtrackCharacterClassGreedy(opIndex);
+ break;
+ case QuantifierNonGreedy:
+ backtrackCharacterClassNonGreedy(opIndex);
+ break;
+ }
+ break;
+
+ case PatternTerm::TypeAssertionBOL:
+ backtrackAssertionBOL(opIndex);
+ break;
+
+ case PatternTerm::TypeAssertionEOL:
+ backtrackAssertionEOL(opIndex);
+ break;
+
+ case PatternTerm::TypeAssertionWordBoundary:
+ backtrackAssertionWordBoundary(opIndex);
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ case PatternTerm::TypeParentheticalAssertion:
+ ASSERT_NOT_REACHED();
+
+ case PatternTerm::TypeDotStarEnclosure:
+ backtrackDotStarEnclosure(opIndex);
+ break;
+
+ case PatternTerm::TypeBackReference:
+ m_shouldFallBack = true;
+ break;
+ }
+ }
+
+ void generate()
+ {
+ // Forwards generate the matching code.
+ ASSERT(m_ops.size());
+ size_t opIndex = 0;
+
+ do {
+ YarrOp& op = m_ops[opIndex];
+ switch (op.m_op) {
+
+ case OpTerm:
+ generateTerm(opIndex);
+ break;
+
+ // OpBodyAlternativeBegin/Next/End
+ //
+ // These nodes wrap the set of alternatives in the body of the regular expression.
+ // There may be either one or two chains of OpBodyAlternative nodes, one representing
+ // the 'once through' sequence of alternatives (if any exist), and one representing
+ // the repeating alternatives (again, if any exist).
+ //
+ // Upon normal entry to the Begin alternative, we will check that input is available.
+ // Reentry to the Begin alternative will take place after the check has taken place,
+ // and will assume that the input position has already been progressed as appropriate.
+ //
+ // Entry to subsequent Next/End alternatives occurs when the prior alternative has
+ // successfully completed a match - return a success state from JIT code.
+ //
+ // Next alternatives allow for reentry optimized to suit backtracking from its
+ // preceding alternative. It expects the input position to still be set to a position
+ // appropriate to its predecessor, and it will only perform an input check if the
+ // predecessor had a minimum size less than its own.
+ //
+ // In the case 'once through' expressions, the End node will also have a reentry
+ // point to jump to when the last alternative fails. Again, this expects the input
+ // position to still reflect that expected by the prior alternative.
+ case OpBodyAlternativeBegin: {
+ PatternAlternative* alternative = op.m_alternative;
+
+ // Upon entry at the head of the set of alternatives, check if input is available
+ // to run the first alternative. (This progresses the input position).
+ op.m_jumps.append(jumpIfNoAvailableInput(alternative->m_minimumSize));
+ // We will reenter after the check, and assume the input position to have been
+ // set as appropriate to this alternative.
+ op.m_reentry = label();
+
+ m_checked += alternative->m_minimumSize;
+ break;
+ }
+ case OpBodyAlternativeNext:
+ case OpBodyAlternativeEnd: {
+ PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+ PatternAlternative* alternative = op.m_alternative;
+
+ // If we get here, the prior alternative matched - return success.
+
+ // Adjust the stack pointer to remove the pattern's frame.
+ removeCallFrame();
+
+ // Load appropriate values into the return register and the first output
+ // slot, and return. In the case of pattern with a fixed size, we will
+ // not have yet set the value in the first
+ ASSERT(index != returnRegister);
+ if (m_pattern.m_body->m_hasFixedSize) {
+ move(index, returnRegister);
+ if (priorAlternative->m_minimumSize)
+ sub32(Imm32(priorAlternative->m_minimumSize), returnRegister);
+ if (compileMode == IncludeSubpatterns)
+ store32(returnRegister, output);
+ } else
+ getMatchStart(returnRegister);
+ if (compileMode == IncludeSubpatterns)
+ store32(index, Address(output, 4));
+ move(index, returnRegister2);
+
+ generateReturn();
+
+ // This is the divide between the tail of the prior alternative, above, and
+ // the head of the subsequent alternative, below.
+
+ if (op.m_op == OpBodyAlternativeNext) {
+ // This is the reentry point for the Next alternative. We expect any code
+ // that jumps here to do so with the input position matching that of the
+ // PRIOR alteranative, and we will only check input availability if we
+ // need to progress it forwards.
+ op.m_reentry = label();
+ if (alternative->m_minimumSize > priorAlternative->m_minimumSize) {
+ add32(Imm32(alternative->m_minimumSize - priorAlternative->m_minimumSize), index);
+ op.m_jumps.append(jumpIfNoAvailableInput());
+ } else if (priorAlternative->m_minimumSize > alternative->m_minimumSize)
+ sub32(Imm32(priorAlternative->m_minimumSize - alternative->m_minimumSize), index);
+ } else if (op.m_nextOp == notFound) {
+ // This is the reentry point for the End of 'once through' alternatives,
+ // jumped to when the last alternative fails to match.
+ op.m_reentry = label();
+ sub32(Imm32(priorAlternative->m_minimumSize), index);
+ }
+
+ if (op.m_op == OpBodyAlternativeNext)
+ m_checked += alternative->m_minimumSize;
+ m_checked -= priorAlternative->m_minimumSize;
+ break;
+ }
+
+ // OpSimpleNestedAlternativeBegin/Next/End
+ // OpNestedAlternativeBegin/Next/End
+ //
+ // These nodes are used to handle sets of alternatives that are nested within
+ // subpatterns and parenthetical assertions. The 'simple' forms are used where
+ // we do not need to be able to backtrack back into any alternative other than
+ // the last, the normal forms allow backtracking into any alternative.
+ //
+ // Each Begin/Next node is responsible for planting an input check to ensure
+ // sufficient input is available on entry. Next nodes additionally need to
+ // jump to the end - Next nodes use the End node's m_jumps list to hold this
+ // set of jumps.
+ //
+ // In the non-simple forms, successful alternative matches must store a
+ // 'return address' using a DataLabelPtr, used to store the address to jump
+ // to when backtracking, to get to the code for the appropriate alternative.
+ case OpSimpleNestedAlternativeBegin:
+ case OpNestedAlternativeBegin: {
+ PatternTerm* term = op.m_term;
+ PatternAlternative* alternative = op.m_alternative;
+ PatternDisjunction* disjunction = term->parentheses.disjunction;
+
+ // Calculate how much input we need to check for, and if non-zero check.
+ op.m_checkAdjust = alternative->m_minimumSize;
+ if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion))
+ op.m_checkAdjust -= disjunction->m_minimumSize;
+ if (op.m_checkAdjust)
+ op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust));
+
+ m_checked += op.m_checkAdjust;
+ break;
+ }
+ case OpSimpleNestedAlternativeNext:
+ case OpNestedAlternativeNext: {
+ PatternTerm* term = op.m_term;
+ PatternAlternative* alternative = op.m_alternative;
+ PatternDisjunction* disjunction = term->parentheses.disjunction;
+
+ // In the non-simple case, store a 'return address' so we can backtrack correctly.
+ if (op.m_op == OpNestedAlternativeNext) {
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ unsigned alternativeFrameLocation = parenthesesFrameLocation;
+ if (term->quantityType != QuantifierFixedCount)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation);
+ }
+
+ if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) {
+ // If the previous alternative matched without consuming characters then
+ // backtrack to try to match while consumming some input.
+ op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ }
+
+ // If we reach here then the last alternative has matched - jump to the
+ // End node, to skip over any further alternatives.
+ //
+ // FIXME: this is logically O(N^2) (though N can be expected to be very
+ // small). We could avoid this either by adding an extra jump to the JIT
+ // data structures, or by making backtracking code that jumps to Next
+ // alternatives are responsible for checking that input is available (if
+ // we didn't need to plant the input checks, then m_jumps would be free).
+ YarrOp* endOp = &m_ops[op.m_nextOp];
+ while (endOp->m_nextOp != notFound) {
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext);
+ endOp = &m_ops[endOp->m_nextOp];
+ }
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd);
+ endOp->m_jumps.append(jump());
+
+ // This is the entry point for the next alternative.
+ op.m_reentry = label();
+
+ // Calculate how much input we need to check for, and if non-zero check.
+ op.m_checkAdjust = alternative->m_minimumSize;
+ if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion))
+ op.m_checkAdjust -= disjunction->m_minimumSize;
+ if (op.m_checkAdjust)
+ op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust));
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked -= lastOp.m_checkAdjust;
+ m_checked += op.m_checkAdjust;
+ break;
+ }
+ case OpSimpleNestedAlternativeEnd:
+ case OpNestedAlternativeEnd: {
+ PatternTerm* term = op.m_term;
+
+ // In the non-simple case, store a 'return address' so we can backtrack correctly.
+ if (op.m_op == OpNestedAlternativeEnd) {
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ unsigned alternativeFrameLocation = parenthesesFrameLocation;
+ if (term->quantityType != QuantifierFixedCount)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation);
+ }
+
+ if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) {
+ // If the previous alternative matched without consuming characters then
+ // backtrack to try to match while consumming some input.
+ op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ }
+
+ // If this set of alternatives contains more than one alternative,
+ // then the Next nodes will have planted jumps to the End, and added
+ // them to this node's m_jumps list.
+ op.m_jumps.link(this);
+ op.m_jumps.clear();
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked -= lastOp.m_checkAdjust;
+ break;
+ }
+
+ // OpParenthesesSubpatternOnceBegin/End
+ //
+ // These nodes support (optionally) capturing subpatterns, that have a
+ // quantity count of 1 (this covers fixed once, and ?/?? quantifiers).
+ case OpParenthesesSubpatternOnceBegin: {
+ PatternTerm* term = op.m_term;
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ const RegisterID indexTemporary = regT0;
+ ASSERT(term->quantityCount == 1);
+
+ // Upon entry to a Greedy quantified set of parenthese store the index.
+ // We'll use this for two purposes:
+ // - To indicate which iteration we are on of mathing the remainder of
+ // the expression after the parentheses - the first, including the
+ // match within the parentheses, or the second having skipped over them.
+ // - To check for empty matches, which must be rejected.
+ //
+ // At the head of a NonGreedy set of parentheses we'll immediately set the
+ // value on the stack to -1 (indicating a match skipping the subpattern),
+ // and plant a jump to the end. We'll also plant a label to backtrack to
+ // to reenter the subpattern later, with a store to set up index on the
+ // second iteration.
+ //
+ // FIXME: for capturing parens, could use the index in the capture array?
+ if (term->quantityType == QuantifierGreedy)
+ storeToFrame(index, parenthesesFrameLocation);
+ else if (term->quantityType == QuantifierNonGreedy) {
+ storeToFrame(TrustedImm32(-1), parenthesesFrameLocation);
+ op.m_jumps.append(jump());
+ op.m_reentry = label();
+ storeToFrame(index, parenthesesFrameLocation);
+ }
+
+ // If the parenthese are capturing, store the starting index value to the
+ // captures array, offsetting as necessary.
+ //
+ // FIXME: could avoid offsetting this value in JIT code, apply
+ // offsets only afterwards, at the point the results array is
+ // being accessed.
+ if (term->capture() && compileMode == IncludeSubpatterns) {
+ int inputOffset = term->inputPosition - m_checked;
+ if (term->quantityType == QuantifierFixedCount)
+ inputOffset -= term->parentheses.disjunction->m_minimumSize;
+ if (inputOffset) {
+ move(index, indexTemporary);
+ add32(Imm32(inputOffset), indexTemporary);
+ setSubpatternStart(indexTemporary, term->parentheses.subpatternId);
+ } else
+ setSubpatternStart(index, term->parentheses.subpatternId);
+ }
+ break;
+ }
+ case OpParenthesesSubpatternOnceEnd: {
+ PatternTerm* term = op.m_term;
+ const RegisterID indexTemporary = regT0;
+ ASSERT(term->quantityCount == 1);
+
+#ifndef NDEBUG
+ // Runtime ASSERT to make sure that the nested alternative handled the
+ // "no input consumed" check.
+ if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) {
+ Jump pastBreakpoint;
+ pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ breakpoint();
+ pastBreakpoint.link(this);
+ }
+#endif
+
+ // If the parenthese are capturing, store the ending index value to the
+ // captures array, offsetting as necessary.
+ //
+ // FIXME: could avoid offsetting this value in JIT code, apply
+ // offsets only afterwards, at the point the results array is
+ // being accessed.
+ if (term->capture() && compileMode == IncludeSubpatterns) {
+ int inputOffset = term->inputPosition - m_checked;
+ if (inputOffset) {
+ move(index, indexTemporary);
+ add32(Imm32(inputOffset), indexTemporary);
+ setSubpatternEnd(indexTemporary, term->parentheses.subpatternId);
+ } else
+ setSubpatternEnd(index, term->parentheses.subpatternId);
+ }
+
+ // If the parentheses are quantified Greedy then add a label to jump back
+ // to if get a failed match from after the parentheses. For NonGreedy
+ // parentheses, link the jump from before the subpattern to here.
+ if (term->quantityType == QuantifierGreedy)
+ op.m_reentry = label();
+ else if (term->quantityType == QuantifierNonGreedy) {
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+ beginOp.m_jumps.link(this);
+ }
+ break;
+ }
+
+ // OpParenthesesSubpatternTerminalBegin/End
+ case OpParenthesesSubpatternTerminalBegin: {
+ PatternTerm* term = op.m_term;
+ ASSERT(term->quantityType == QuantifierGreedy);
+ ASSERT(term->quantityCount == quantifyInfinite);
+ ASSERT(!term->capture());
+
+ // Upon entry set a label to loop back to.
+ op.m_reentry = label();
+
+ // Store the start index of the current match; we need to reject zero
+ // length matches.
+ storeToFrame(index, term->frameLocation);
+ break;
+ }
+ case OpParenthesesSubpatternTerminalEnd: {
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+#ifndef NDEBUG
+ PatternTerm* term = op.m_term;
+
+ // Runtime ASSERT to make sure that the nested alternative handled the
+ // "no input consumed" check.
+ Jump pastBreakpoint;
+ pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*)));
+ breakpoint();
+ pastBreakpoint.link(this);
+#endif
+
+ // We know that the match is non-zero, we can accept it and
+ // loop back up to the head of the subpattern.
+ jump(beginOp.m_reentry);
+
+ // This is the entry point to jump to when we stop matching - we will
+ // do so once the subpattern cannot match any more.
+ op.m_reentry = label();
+ break;
+ }
+
+ // OpParentheticalAssertionBegin/End
+ case OpParentheticalAssertionBegin: {
+ PatternTerm* term = op.m_term;
+
+ // Store the current index - assertions should not update index, so
+ // we will need to restore it upon a successful match.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ storeToFrame(index, parenthesesFrameLocation);
+
+ // Check
+ op.m_checkAdjust = m_checked - term->inputPosition;
+ if (op.m_checkAdjust)
+ sub32(Imm32(op.m_checkAdjust), index);
+
+ m_checked -= op.m_checkAdjust;
+ break;
+ }
+ case OpParentheticalAssertionEnd: {
+ PatternTerm* term = op.m_term;
+
+ // Restore the input index value.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ loadFromFrame(parenthesesFrameLocation, index);
+
+ // If inverted, a successful match of the assertion must be treated
+ // as a failure, so jump to backtracking.
+ if (term->invert()) {
+ op.m_jumps.append(jump());
+ op.m_reentry = label();
+ }
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked += lastOp.m_checkAdjust;
+ break;
+ }
+
+ case OpMatchFailed:
+ removeCallFrame();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
+ generateReturn();
+ break;
+ }
+
+ ++opIndex;
+ } while (opIndex < m_ops.size());
+ }
+
+ void backtrack()
+ {
+ // Backwards generate the backtracking code.
+ size_t opIndex = m_ops.size();
+ ASSERT(opIndex);
+
+ do {
+ --opIndex;
+ YarrOp& op = m_ops[opIndex];
+ switch (op.m_op) {
+
+ case OpTerm:
+ backtrackTerm(opIndex);
+ break;
+
+ // OpBodyAlternativeBegin/Next/End
+ //
+ // For each Begin/Next node representing an alternative, we need to decide what to do
+ // in two circumstances:
+ // - If we backtrack back into this node, from within the alternative.
+ // - If the input check at the head of the alternative fails (if this exists).
+ //
+ // We treat these two cases differently since in the former case we have slightly
+ // more information - since we are backtracking out of a prior alternative we know
+ // that at least enough input was available to run it. For example, given the regular
+ // expression /a|b/, if we backtrack out of the first alternative (a failed pattern
+ // character match of 'a'), then we need not perform an additional input availability
+ // check before running the second alternative.
+ //
+ // Backtracking required differs for the last alternative, which in the case of the
+ // repeating set of alternatives must loop. The code generated for the last alternative
+ // will also be used to handle all input check failures from any prior alternatives -
+ // these require similar functionality, in seeking the next available alternative for
+ // which there is sufficient input.
+ //
+ // Since backtracking of all other alternatives simply requires us to link backtracks
+ // to the reentry point for the subsequent alternative, we will only be generating any
+ // code when backtracking the last alternative.
+ case OpBodyAlternativeBegin:
+ case OpBodyAlternativeNext: {
+ PatternAlternative* alternative = op.m_alternative;
+
+ if (op.m_op == OpBodyAlternativeNext) {
+ PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+ m_checked += priorAlternative->m_minimumSize;
+ }
+ m_checked -= alternative->m_minimumSize;
+
+ // Is this the last alternative? If not, then if we backtrack to this point we just
+ // need to jump to try to match the next alternative.
+ if (m_ops[op.m_nextOp].m_op != OpBodyAlternativeEnd) {
+ m_backtrackingState.linkTo(m_ops[op.m_nextOp].m_reentry, this);
+ break;
+ }
+ YarrOp& endOp = m_ops[op.m_nextOp];
+
+ YarrOp* beginOp = &op;
+ while (beginOp->m_op != OpBodyAlternativeBegin) {
+ ASSERT(beginOp->m_op == OpBodyAlternativeNext);
+ beginOp = &m_ops[beginOp->m_previousOp];
+ }
+
+ bool onceThrough = endOp.m_nextOp == notFound;
+
+ // First, generate code to handle cases where we backtrack out of an attempted match
+ // of the last alternative. If this is a 'once through' set of alternatives then we
+ // have nothing to do - link this straight through to the End.
+ if (onceThrough)
+ m_backtrackingState.linkTo(endOp.m_reentry, this);
+ else {
+ // If we don't need to move the input poistion, and the pattern has a fixed size
+ // (in which case we omit the store of the start index until the pattern has matched)
+ // then we can just link the backtrack out of the last alternative straight to the
+ // head of the first alternative.
+ if (m_pattern.m_body->m_hasFixedSize
+ && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize)
+ && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1))
+ m_backtrackingState.linkTo(beginOp->m_reentry, this);
+ else {
+ // We need to generate a trampoline of code to execute before looping back
+ // around to the first alternative.
+ m_backtrackingState.link(this);
+
+ // If the pattern size is not fixed, then store the start index, for use if we match.
+ if (!m_pattern.m_body->m_hasFixedSize) {
+ if (alternative->m_minimumSize == 1)
+ setMatchStart(index);
+ else {
+ move(index, regT0);
+ if (alternative->m_minimumSize)
+ sub32(Imm32(alternative->m_minimumSize - 1), regT0);
+ else
+ add32(TrustedImm32(1), regT0);
+ setMatchStart(regT0);
+ }
+ }
+
+ // Generate code to loop. Check whether the last alternative is longer than the
+ // first (e.g. /a|xy/ or /a|xyz/).
+ if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) {
+ // We want to loop, and increment input position. If the delta is 1, it is
+ // already correctly incremented, if more than one then decrement as appropriate.
+ unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize;
+ ASSERT(delta);
+ if (delta != 1)
+ sub32(Imm32(delta - 1), index);
+ jump(beginOp->m_reentry);
+ } else {
+ // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot
+ // be sufficent input available to handle this, so just fall through.
+ unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize;
+ if (delta != 0xFFFFFFFFu) {
+ // We need to check input because we are incrementing the input.
+ add32(Imm32(delta + 1), index);
+ checkInput().linkTo(beginOp->m_reentry, this);
+ }
+ }
+ }
+ }
+
+ // We can reach this point in the code in two ways:
+ // - Fallthrough from the code above (a repeating alternative backtracked out of its
+ // last alternative, and did not have sufficent input to run the first).
+ // - We will loop back up to the following label when a releating alternative loops,
+ // following a failed input check.
+ //
+ // Either way, we have just failed the input check for the first alternative.
+ Label firstInputCheckFailed(this);
+
+ // Generate code to handle input check failures from alternatives except the last.
+ // prevOp is the alternative we're handling a bail out from (initially Begin), and
+ // nextOp is the alternative we will be attempting to reenter into.
+ //
+ // We will link input check failures from the forwards matching path back to the code
+ // that can handle them.
+ YarrOp* prevOp = beginOp;
+ YarrOp* nextOp = &m_ops[beginOp->m_nextOp];
+ while (nextOp->m_op != OpBodyAlternativeEnd) {
+ prevOp->m_jumps.link(this);
+
+ // We only get here if an input check fails, it is only worth checking again
+ // if the next alternative has a minimum size less than the last.
+ if (prevOp->m_alternative->m_minimumSize > nextOp->m_alternative->m_minimumSize) {
+ // FIXME: if we added an extra label to YarrOp, we could avoid needing to
+ // subtract delta back out, and reduce this code. Should performance test
+ // the benefit of this.
+ unsigned delta = prevOp->m_alternative->m_minimumSize - nextOp->m_alternative->m_minimumSize;
+ sub32(Imm32(delta), index);
+ Jump fail = jumpIfNoAvailableInput();
+ add32(Imm32(delta), index);
+ jump(nextOp->m_reentry);
+ fail.link(this);
+ } else if (prevOp->m_alternative->m_minimumSize < nextOp->m_alternative->m_minimumSize)
+ add32(Imm32(nextOp->m_alternative->m_minimumSize - prevOp->m_alternative->m_minimumSize), index);
+ prevOp = nextOp;
+ nextOp = &m_ops[nextOp->m_nextOp];
+ }
+
+ // We fall through to here if there is insufficient input to run the last alternative.
+
+ // If there is insufficient input to run the last alternative, then for 'once through'
+ // alternatives we are done - just jump back up into the forwards matching path at the End.
+ if (onceThrough) {
+ op.m_jumps.linkTo(endOp.m_reentry, this);
+ jump(endOp.m_reentry);
+ break;
+ }
+
+ // For repeating alternatives, link any input check failure from the last alternative to
+ // this point.
+ op.m_jumps.link(this);
+
+ bool needsToUpdateMatchStart = !m_pattern.m_body->m_hasFixedSize;
+
+ // Check for cases where input position is already incremented by 1 for the last
+ // alternative (this is particularly useful where the minimum size of the body
+ // disjunction is 0, e.g. /a*|b/).
+ if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) {
+ // index is already incremented by 1, so just store it now!
+ setMatchStart(index);
+ needsToUpdateMatchStart = false;
+ }
+
+ // Check whether there is sufficient input to loop. Increment the input position by
+ // one, and check. Also add in the minimum disjunction size before checking - there
+ // is no point in looping if we're just going to fail all the input checks around
+ // the next iteration.
+ ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize);
+ if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) {
+ // If the last alternative had the same minimum size as the disjunction,
+ // just simply increment input pos by 1, no adjustment based on minimum size.
+ add32(TrustedImm32(1), index);
+ } else {
+ // If the minumum for the last alternative was one greater than than that
+ // for the disjunction, we're already progressed by 1, nothing to do!
+ unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1;
+ if (delta)
+ sub32(Imm32(delta), index);
+ }
+ Jump matchFailed = jumpIfNoAvailableInput();
+
+ if (needsToUpdateMatchStart) {
+ if (!m_pattern.m_body->m_minimumSize)
+ setMatchStart(index);
+ else {
+ move(index, regT0);
+ sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0);
+ setMatchStart(regT0);
+ }
+ }
+
+ // Calculate how much more input the first alternative requires than the minimum
+ // for the body as a whole. If no more is needed then we dont need an additional
+ // input check here - jump straight back up to the start of the first alternative.
+ if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize)
+ jump(beginOp->m_reentry);
+ else {
+ if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize)
+ add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index);
+ else
+ sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index);
+ checkInput().linkTo(beginOp->m_reentry, this);
+ jump(firstInputCheckFailed);
+ }
+
+ // We jump to here if we iterate to the point that there is insufficient input to
+ // run any matches, and need to return a failure state from JIT code.
+ matchFailed.link(this);
+
+ removeCallFrame();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
+ generateReturn();
+ break;
+ }
+ case OpBodyAlternativeEnd: {
+ // We should never backtrack back into a body disjunction.
+ ASSERT(m_backtrackingState.isEmpty());
+
+ PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative;
+ m_checked += priorAlternative->m_minimumSize;
+ break;
+ }
+
+ // OpSimpleNestedAlternativeBegin/Next/End
+ // OpNestedAlternativeBegin/Next/End
+ //
+ // Generate code for when we backtrack back out of an alternative into
+ // a Begin or Next node, or when the entry input count check fails. If
+ // there are more alternatives we need to jump to the next alternative,
+ // if not we backtrack back out of the current set of parentheses.
+ //
+ // In the case of non-simple nested assertions we need to also link the
+ // 'return address' appropriately to backtrack back out into the correct
+ // alternative.
+ case OpSimpleNestedAlternativeBegin:
+ case OpSimpleNestedAlternativeNext:
+ case OpNestedAlternativeBegin:
+ case OpNestedAlternativeNext: {
+ YarrOp& nextOp = m_ops[op.m_nextOp];
+ bool isBegin = op.m_previousOp == notFound;
+ bool isLastAlternative = nextOp.m_nextOp == notFound;
+ ASSERT(isBegin == (op.m_op == OpSimpleNestedAlternativeBegin || op.m_op == OpNestedAlternativeBegin));
+ ASSERT(isLastAlternative == (nextOp.m_op == OpSimpleNestedAlternativeEnd || nextOp.m_op == OpNestedAlternativeEnd));
+
+ // Treat an input check failure the same as a failed match.
+ m_backtrackingState.append(op.m_jumps);
+
+ // Set the backtracks to jump to the appropriate place. We may need
+ // to link the backtracks in one of three different way depending on
+ // the type of alternative we are dealing with:
+ // - A single alternative, with no simplings.
+ // - The last alternative of a set of two or more.
+ // - An alternative other than the last of a set of two or more.
+ //
+ // In the case of a single alternative on its own, we don't need to
+ // jump anywhere - if the alternative fails to match we can just
+ // continue to backtrack out of the parentheses without jumping.
+ //
+ // In the case of the last alternative in a set of more than one, we
+ // need to jump to return back out to the beginning. We'll do so by
+ // adding a jump to the End node's m_jumps list, and linking this
+ // when we come to generate the Begin node. For alternatives other
+ // than the last, we need to jump to the next alternative.
+ //
+ // If the alternative had adjusted the input position we must link
+ // backtracking to here, correct, and then jump on. If not we can
+ // link the backtracks directly to their destination.
+ if (op.m_checkAdjust) {
+ // Handle the cases where we need to link the backtracks here.
+ m_backtrackingState.link(this);
+ sub32(Imm32(op.m_checkAdjust), index);
+ if (!isLastAlternative) {
+ // An alternative that is not the last should jump to its successor.
+ jump(nextOp.m_reentry);
+ } else if (!isBegin) {
+ // The last of more than one alternatives must jump back to the beginning.
+ nextOp.m_jumps.append(jump());
+ } else {
+ // A single alternative on its own can fall through.
+ m_backtrackingState.fallthrough();
+ }
+ } else {
+ // Handle the cases where we can link the backtracks directly to their destinations.
+ if (!isLastAlternative) {
+ // An alternative that is not the last should jump to its successor.
+ m_backtrackingState.linkTo(nextOp.m_reentry, this);
+ } else if (!isBegin) {
+ // The last of more than one alternatives must jump back to the beginning.
+ m_backtrackingState.takeBacktracksToJumpList(nextOp.m_jumps, this);
+ }
+ // In the case of a single alternative on its own do nothing - it can fall through.
+ }
+
+ // If there is a backtrack jump from a zero length match link it here.
+ if (op.m_zeroLengthMatch.isSet())
+ m_backtrackingState.append(op.m_zeroLengthMatch);
+
+ // At this point we've handled the backtracking back into this node.
+ // Now link any backtracks that need to jump to here.
+
+ // For non-simple alternatives, link the alternative's 'return address'
+ // so that we backtrack back out into the previous alternative.
+ if (op.m_op == OpNestedAlternativeNext)
+ m_backtrackingState.append(op.m_returnAddress);
+
+ // If there is more than one alternative, then the last alternative will
+ // have planted a jump to be linked to the end. This jump was added to the
+ // End node's m_jumps list. If we are back at the beginning, link it here.
+ if (isBegin) {
+ YarrOp* endOp = &m_ops[op.m_nextOp];
+ while (endOp->m_nextOp != notFound) {
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext);
+ endOp = &m_ops[endOp->m_nextOp];
+ }
+ ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd);
+ m_backtrackingState.append(endOp->m_jumps);
+ }
+
+ if (!isBegin) {
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked += lastOp.m_checkAdjust;
+ }
+ m_checked -= op.m_checkAdjust;
+ break;
+ }
+ case OpSimpleNestedAlternativeEnd:
+ case OpNestedAlternativeEnd: {
+ PatternTerm* term = op.m_term;
+
+ // If there is a backtrack jump from a zero length match link it here.
+ if (op.m_zeroLengthMatch.isSet())
+ m_backtrackingState.append(op.m_zeroLengthMatch);
+
+ // If we backtrack into the end of a simple subpattern do nothing;
+ // just continue through into the last alternative. If we backtrack
+ // into the end of a non-simple set of alterntives we need to jump
+ // to the backtracking return address set up during generation.
+ if (op.m_op == OpNestedAlternativeEnd) {
+ m_backtrackingState.link(this);
+
+ // Plant a jump to the return address.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ unsigned alternativeFrameLocation = parenthesesFrameLocation;
+ if (term->quantityType != QuantifierFixedCount)
+ alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ loadFromFrameAndJump(alternativeFrameLocation);
+
+ // Link the DataLabelPtr associated with the end of the last
+ // alternative to this point.
+ m_backtrackingState.append(op.m_returnAddress);
+ }
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked += lastOp.m_checkAdjust;
+ break;
+ }
+
+ // OpParenthesesSubpatternOnceBegin/End
+ //
+ // When we are backtracking back out of a capturing subpattern we need
+ // to clear the start index in the matches output array, to record that
+ // this subpattern has not been captured.
+ //
+ // When backtracking back out of a Greedy quantified subpattern we need
+ // to catch this, and try running the remainder of the alternative after
+ // the subpattern again, skipping the parentheses.
+ //
+ // Upon backtracking back into a quantified set of parentheses we need to
+ // check whether we were currently skipping the subpattern. If not, we
+ // can backtrack into them, if we were we need to either backtrack back
+ // out of the start of the parentheses, or jump back to the forwards
+ // matching start, depending of whether the match is Greedy or NonGreedy.
+ case OpParenthesesSubpatternOnceBegin: {
+ PatternTerm* term = op.m_term;
+ ASSERT(term->quantityCount == 1);
+
+ // We only need to backtrack to thispoint if capturing or greedy.
+ if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) {
+ m_backtrackingState.link(this);
+
+ // If capturing, clear the capture (we only need to reset start).
+ if (term->capture() && compileMode == IncludeSubpatterns)
+ clearSubpatternStart(term->parentheses.subpatternId);
+
+ // If Greedy, jump to the end.
+ if (term->quantityType == QuantifierGreedy) {
+ // Clear the flag in the stackframe indicating we ran through the subpattern.
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ storeToFrame(TrustedImm32(-1), parenthesesFrameLocation);
+ // Jump to after the parentheses, skipping the subpattern.
+ jump(m_ops[op.m_nextOp].m_reentry);
+ // A backtrack from after the parentheses, when skipping the subpattern,
+ // will jump back to here.
+ op.m_jumps.link(this);
+ }
+
+ m_backtrackingState.fallthrough();
+ }
+ break;
+ }
+ case OpParenthesesSubpatternOnceEnd: {
+ PatternTerm* term = op.m_term;
+
+ if (term->quantityType != QuantifierFixedCount) {
+ m_backtrackingState.link(this);
+
+ // Check whether we should backtrack back into the parentheses, or if we
+ // are currently in a state where we had skipped over the subpattern
+ // (in which case the flag value on the stack will be -1).
+ unsigned parenthesesFrameLocation = term->frameLocation;
+ Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1));
+
+ if (term->quantityType == QuantifierGreedy) {
+ // For Greedy parentheses, we skip after having already tried going
+ // through the subpattern, so if we get here we're done.
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+ beginOp.m_jumps.append(hadSkipped);
+ } else {
+ // For NonGreedy parentheses, we try skipping the subpattern first,
+ // so if we get here we need to try running through the subpattern
+ // next. Jump back to the start of the parentheses in the forwards
+ // matching path.
+ ASSERT(term->quantityType == QuantifierNonGreedy);
+ YarrOp& beginOp = m_ops[op.m_previousOp];
+ hadSkipped.linkTo(beginOp.m_reentry, this);
+ }
+
+ m_backtrackingState.fallthrough();
+ }
+
+ m_backtrackingState.append(op.m_jumps);
+ break;
+ }
+
+ // OpParenthesesSubpatternTerminalBegin/End
+ //
+ // Terminal subpatterns will always match - there is nothing after them to
+ // force a backtrack, and they have a minimum count of 0, and as such will
+ // always produce an acceptable result.
+ case OpParenthesesSubpatternTerminalBegin: {
+ // We will backtrack to this point once the subpattern cannot match any
+ // more. Since no match is accepted as a successful match (we are Greedy
+ // quantified with a minimum of zero) jump back to the forwards matching
+ // path at the end.
+ YarrOp& endOp = m_ops[op.m_nextOp];
+ m_backtrackingState.linkTo(endOp.m_reentry, this);
+ break;
+ }
+ case OpParenthesesSubpatternTerminalEnd:
+ // We should never be backtracking to here (hence the 'terminal' in the name).
+ ASSERT(m_backtrackingState.isEmpty());
+ m_backtrackingState.append(op.m_jumps);
+ break;
+
+ // OpParentheticalAssertionBegin/End
+ case OpParentheticalAssertionBegin: {
+ PatternTerm* term = op.m_term;
+ YarrOp& endOp = m_ops[op.m_nextOp];
+
+ // We need to handle the backtracks upon backtracking back out
+ // of a parenthetical assertion if either we need to correct
+ // the input index, or the assertion was inverted.
+ if (op.m_checkAdjust || term->invert()) {
+ m_backtrackingState.link(this);
+
+ if (op.m_checkAdjust)
+ add32(Imm32(op.m_checkAdjust), index);
+
+ // In an inverted assertion failure to match the subpattern
+ // is treated as a successful match - jump to the end of the
+ // subpattern. We already have adjusted the input position
+ // back to that before the assertion, which is correct.
+ if (term->invert())
+ jump(endOp.m_reentry);
+
+ m_backtrackingState.fallthrough();
+ }
+
+ // The End node's jump list will contain any backtracks into
+ // the end of the assertion. Also, if inverted, we will have
+ // added the failure caused by a successful match to this.
+ m_backtrackingState.append(endOp.m_jumps);
+
+ m_checked += op.m_checkAdjust;
+ break;
+ }
+ case OpParentheticalAssertionEnd: {
+ // FIXME: We should really be clearing any nested subpattern
+ // matches on bailing out from after the pattern. Firefox has
+ // this bug too (presumably because they use YARR!)
+
+ // Never backtrack into an assertion; later failures bail to before the begin.
+ m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this);
+
+ YarrOp& lastOp = m_ops[op.m_previousOp];
+ m_checked -= lastOp.m_checkAdjust;
+ break;
+ }
+
+ case OpMatchFailed:
+ break;
+ }
+
+ } while (opIndex);
+ }
+
+ // Compilation methods:
+ // ====================
+
+ // opCompileParenthesesSubpattern
+ // Emits ops for a subpattern (set of parentheses). These consist
+ // of a set of alternatives wrapped in an outer set of nodes for
+ // the parentheses.
+ // Supported types of parentheses are 'Once' (quantityCount == 1)
+ // and 'Terminal' (non-capturing parentheses quantified as greedy
+ // and infinite).
+ // Alternatives will use the 'Simple' set of ops if either the
+ // subpattern is terminal (in which case we will never need to
+ // backtrack), or if the subpattern only contains one alternative.
+ void opCompileParenthesesSubpattern(PatternTerm* term)
+ {
+ YarrOpCode parenthesesBeginOpCode;
+ YarrOpCode parenthesesEndOpCode;
+ YarrOpCode alternativeBeginOpCode = OpSimpleNestedAlternativeBegin;
+ YarrOpCode alternativeNextOpCode = OpSimpleNestedAlternativeNext;
+ YarrOpCode alternativeEndOpCode = OpSimpleNestedAlternativeEnd;
+
+ // We can currently only compile quantity 1 subpatterns that are
+ // not copies. We generate a copy in the case of a range quantifier,
+ // e.g. /(?:x){3,9}/, or /(?:x)+/ (These are effectively expanded to
+ // /(?:x){3,3}(?:x){0,6}/ and /(?:x)(?:x)*/ repectively). The problem
+ // comes where the subpattern is capturing, in which case we would
+ // need to restore the capture from the first subpattern upon a
+ // failure in the second.
+ if (term->quantityCount == 1 && !term->parentheses.isCopy) {
+ // Select the 'Once' nodes.
+ parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin;
+ parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd;
+
+ // If there is more than one alternative we cannot use the 'simple' nodes.
+ if (term->parentheses.disjunction->m_alternatives.size() != 1) {
+ alternativeBeginOpCode = OpNestedAlternativeBegin;
+ alternativeNextOpCode = OpNestedAlternativeNext;
+ alternativeEndOpCode = OpNestedAlternativeEnd;
+ }
+ } else if (term->parentheses.isTerminal) {
+ // Select the 'Terminal' nodes.
+ parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin;
+ parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd;
+ } else {
+ // This subpattern is not supported by the JIT.
+ m_shouldFallBack = true;
+ return;
+ }
+
+ size_t parenBegin = m_ops.size();
+ m_ops.append(parenthesesBeginOpCode);
+
+ m_ops.append(alternativeBeginOpCode);
+ m_ops.last().m_previousOp = notFound;
+ m_ops.last().m_term = term;
+ Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives;
+ for (unsigned i = 0; i < alternatives.size(); ++i) {
+ size_t lastOpIndex = m_ops.size() - 1;
+
+ PatternAlternative* nestedAlternative = alternatives[i];
+ opCompileAlternative(nestedAlternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(alternativeNextOpCode));
+
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = nestedAlternative;
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+ thisOp.m_term = term;
+ }
+ YarrOp& lastOp = m_ops.last();
+ ASSERT(lastOp.m_op == alternativeNextOpCode);
+ lastOp.m_op = alternativeEndOpCode;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = notFound;
+
+ size_t parenEnd = m_ops.size();
+ m_ops.append(parenthesesEndOpCode);
+
+ m_ops[parenBegin].m_term = term;
+ m_ops[parenBegin].m_previousOp = notFound;
+ m_ops[parenBegin].m_nextOp = parenEnd;
+ m_ops[parenEnd].m_term = term;
+ m_ops[parenEnd].m_previousOp = parenBegin;
+ m_ops[parenEnd].m_nextOp = notFound;
+ }
+
+ // opCompileParentheticalAssertion
+ // Emits ops for a parenthetical assertion. These consist of an
+ // OpSimpleNestedAlternativeBegin/Next/End set of nodes wrapping
+ // the alternatives, with these wrapped by an outer pair of
+ // OpParentheticalAssertionBegin/End nodes.
+ // We can always use the OpSimpleNestedAlternative nodes in the
+ // case of parenthetical assertions since these only ever match
+ // once, and will never backtrack back into the assertion.
+ void opCompileParentheticalAssertion(PatternTerm* term)
+ {
+ size_t parenBegin = m_ops.size();
+ m_ops.append(OpParentheticalAssertionBegin);
+
+ m_ops.append(OpSimpleNestedAlternativeBegin);
+ m_ops.last().m_previousOp = notFound;
+ m_ops.last().m_term = term;
+ Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives;
+ for (unsigned i = 0; i < alternatives.size(); ++i) {
+ size_t lastOpIndex = m_ops.size() - 1;
+
+ PatternAlternative* nestedAlternative = alternatives[i];
+ opCompileAlternative(nestedAlternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(OpSimpleNestedAlternativeNext));
+
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = nestedAlternative;
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+ thisOp.m_term = term;
+ }
+ YarrOp& lastOp = m_ops.last();
+ ASSERT(lastOp.m_op == OpSimpleNestedAlternativeNext);
+ lastOp.m_op = OpSimpleNestedAlternativeEnd;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = notFound;
+
+ size_t parenEnd = m_ops.size();
+ m_ops.append(OpParentheticalAssertionEnd);
+
+ m_ops[parenBegin].m_term = term;
+ m_ops[parenBegin].m_previousOp = notFound;
+ m_ops[parenBegin].m_nextOp = parenEnd;
+ m_ops[parenEnd].m_term = term;
+ m_ops[parenEnd].m_previousOp = parenBegin;
+ m_ops[parenEnd].m_nextOp = notFound;
+ }
+
+ // opCompileAlternative
+ // Called to emit nodes for all terms in an alternative.
+ void opCompileAlternative(PatternAlternative* alternative)
+ {
+ optimizeAlternative(alternative);
+
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+ PatternTerm* term = &alternative->m_terms[i];
+
+ switch (term->type) {
+ case PatternTerm::TypeParenthesesSubpattern:
+ opCompileParenthesesSubpattern(term);
+ break;
+
+ case PatternTerm::TypeParentheticalAssertion:
+ opCompileParentheticalAssertion(term);
+ break;
+
+ default:
+ m_ops.append(term);
+ }
+ }
+ }
+
+ // opCompileBody
+ // This method compiles the body disjunction of the regular expression.
+ // The body consists of two sets of alternatives - zero or more 'once
+ // through' (BOL anchored) alternatives, followed by zero or more
+ // repeated alternatives.
+ // For each of these two sets of alteratives, if not empty they will be
+ // wrapped in a set of OpBodyAlternativeBegin/Next/End nodes (with the
+ // 'begin' node referencing the first alternative, and 'next' nodes
+ // referencing any further alternatives. The begin/next/end nodes are
+ // linked together in a doubly linked list. In the case of repeating
+ // alternatives, the end node is also linked back to the beginning.
+ // If no repeating alternatives exist, then a OpMatchFailed node exists
+ // to return the failing result.
+ void opCompileBody(PatternDisjunction* disjunction)
+ {
+ Vector<PatternAlternative*>& alternatives = disjunction->m_alternatives;
+ size_t currentAlternativeIndex = 0;
+
+ // Emit the 'once through' alternatives.
+ if (alternatives.size() && alternatives[0]->onceThrough()) {
+ m_ops.append(YarrOp(OpBodyAlternativeBegin));
+ m_ops.last().m_previousOp = notFound;
+
+ do {
+ size_t lastOpIndex = m_ops.size() - 1;
+ PatternAlternative* alternative = alternatives[currentAlternativeIndex];
+ opCompileAlternative(alternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(OpBodyAlternativeNext));
+
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = alternative;
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+
+ ++currentAlternativeIndex;
+ } while (currentAlternativeIndex < alternatives.size() && alternatives[currentAlternativeIndex]->onceThrough());
+
+ YarrOp& lastOp = m_ops.last();
+
+ ASSERT(lastOp.m_op == OpBodyAlternativeNext);
+ lastOp.m_op = OpBodyAlternativeEnd;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = notFound;
+ }
+
+ if (currentAlternativeIndex == alternatives.size()) {
+ m_ops.append(YarrOp(OpMatchFailed));
+ return;
+ }
+
+ // Emit the repeated alternatives.
+ size_t repeatLoop = m_ops.size();
+ m_ops.append(YarrOp(OpBodyAlternativeBegin));
+ m_ops.last().m_previousOp = notFound;
+ do {
+ size_t lastOpIndex = m_ops.size() - 1;
+ PatternAlternative* alternative = alternatives[currentAlternativeIndex];
+ ASSERT(!alternative->onceThrough());
+ opCompileAlternative(alternative);
+
+ size_t thisOpIndex = m_ops.size();
+ m_ops.append(YarrOp(OpBodyAlternativeNext));
+
+ YarrOp& lastOp = m_ops[lastOpIndex];
+ YarrOp& thisOp = m_ops[thisOpIndex];
+
+ lastOp.m_alternative = alternative;
+ lastOp.m_nextOp = thisOpIndex;
+ thisOp.m_previousOp = lastOpIndex;
+
+ ++currentAlternativeIndex;
+ } while (currentAlternativeIndex < alternatives.size());
+ YarrOp& lastOp = m_ops.last();
+ ASSERT(lastOp.m_op == OpBodyAlternativeNext);
+ lastOp.m_op = OpBodyAlternativeEnd;
+ lastOp.m_alternative = 0;
+ lastOp.m_nextOp = repeatLoop;
+ }
+
+ void generateEnter()
+ {
+#if CPU(X86_64)
+ push(X86Registers::ebp);
+ move(stackPointerRegister, X86Registers::ebp);
+ push(X86Registers::ebx);
+#elif CPU(X86)
+ push(X86Registers::ebp);
+ move(stackPointerRegister, X86Registers::ebp);
+ // TODO: do we need spill registers to fill the output pointer if there are no sub captures?
+ push(X86Registers::ebx);
+ push(X86Registers::edi);
+ push(X86Registers::esi);
+ // load output into edi (2 = saved ebp + return address).
+ #if COMPILER(MSVC)
+ loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input);
+ loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index);
+ loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length);
+ if (compileMode == IncludeSubpatterns)
+ loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output);
+ #else
+ if (compileMode == IncludeSubpatterns)
+ loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output);
+ #endif
+#elif CPU(ARM)
+ push(ARMRegisters::r4);
+ push(ARMRegisters::r5);
+ push(ARMRegisters::r6);
+#if CPU(ARM_TRADITIONAL)
+ push(ARMRegisters::r8); // scratch register
+#endif
+ if (compileMode == IncludeSubpatterns)
+ move(ARMRegisters::r3, output);
+#elif CPU(SH4)
+ push(SH4Registers::r11);
+ push(SH4Registers::r13);
+#elif CPU(MIPS)
+ // Do nothing.
+#endif
+ }
+
+ void generateReturn()
+ {
+#if CPU(X86_64)
+ pop(X86Registers::ebx);
+ pop(X86Registers::ebp);
+#elif CPU(X86)
+ pop(X86Registers::esi);
+ pop(X86Registers::edi);
+ pop(X86Registers::ebx);
+ pop(X86Registers::ebp);
+#elif CPU(ARM)
+#if CPU(ARM_TRADITIONAL)
+ pop(ARMRegisters::r8); // scratch register
+#endif
+ pop(ARMRegisters::r6);
+ pop(ARMRegisters::r5);
+ pop(ARMRegisters::r4);
+#elif CPU(SH4)
+ pop(SH4Registers::r13);
+ pop(SH4Registers::r11);
+#elif CPU(MIPS)
+ // Do nothing
+#endif
+ ret();
+ }
+
+public:
+ YarrGenerator(YarrPattern& pattern, YarrCharSize charSize)
+ : m_pattern(pattern)
+ , m_charSize(charSize)
+ , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo)
+ , m_shouldFallBack(false)
+ , m_checked(0)
+ {
+ }
+
+ void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject)
+ {
+ generateEnter();
+
+ Jump hasInput = checkInput();
+ move(TrustedImmPtr((void*)WTF::notFound), returnRegister);
+ move(TrustedImm32(0), returnRegister2);
+ generateReturn();
+ hasInput.link(this);
+
+ if (compileMode == IncludeSubpatterns) {
+ for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i)
+ store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int)));
+ }
+
+ if (!m_pattern.m_body->m_hasFixedSize)
+ setMatchStart(index);
+
+ initCallFrame();
+
+ // Compile the pattern to the internal 'YarrOp' representation.
+ opCompileBody(m_pattern.m_body);
+
+ // If we encountered anything we can't handle in the JIT code
+ // (e.g. backreferences) then return early.
+ if (m_shouldFallBack) {
+ jitObject.setFallBack(true);
+ return;
+ }
+
+ generate();
+ backtrack();
+
+ // Link & finalize the code.
+ LinkBuffer linkBuffer(*globalData, this, REGEXP_CODE_ID);
+ m_backtrackingState.linkDataLabels(linkBuffer);
+
+ if (compileMode == MatchOnly) {
+ if (m_charSize == Char8)
+ jitObject.set8BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 8-bit regular expression")));
+ else
+ jitObject.set16BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 16-bit regular expression")));
+ } else {
+ if (m_charSize == Char8)
+ jitObject.set8BitCode(FINALIZE_CODE(linkBuffer, ("8-bit regular expression")));
+ else
+ jitObject.set16BitCode(FINALIZE_CODE(linkBuffer, ("16-bit regular expression")));
+ }
+ jitObject.setFallBack(m_shouldFallBack);
+ }
+
+private:
+ YarrPattern& m_pattern;
+
+ YarrCharSize m_charSize;
+
+ Scale m_charScale;
+
+ // Used to detect regular expression constructs that are not currently
+ // supported in the JIT; fall back to the interpreter when this is detected.
+ bool m_shouldFallBack;
+
+ // The regular expression expressed as a linear sequence of operations.
+ Vector<YarrOp, 128> m_ops;
+
+ // This records the current input offset being applied due to the current
+ // set of alternatives we are nested within. E.g. when matching the
+ // character 'b' within the regular expression /abc/, we will know that
+ // the minimum size for the alternative is 3, checked upon entry to the
+ // alternative, and that 'b' is at offset 1 from the start, and as such
+ // when matching 'b' we need to apply an offset of -2 to the load.
+ //
+ // FIXME: This should go away. Rather than tracking this value throughout
+ // code generation, we should gather this information up front & store it
+ // on the YarrOp structure.
+ int m_checked;
+
+ // This class records state whilst generating the backtracking path of code.
+ BacktrackingState m_backtrackingState;
+};
+
+void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode)
+{
+ if (mode == MatchOnly)
+ YarrGenerator<MatchOnly>(pattern, charSize).compile(globalData, jitObject);
+ else
+ YarrGenerator<IncludeSubpatterns>(pattern, charSize).compile(globalData, jitObject);
+}
+
+}}
+
+#endif
diff --git a/src/3rdparty/masm/yarr/YarrJIT.h b/src/3rdparty/masm/yarr/YarrJIT.h
new file mode 100644
index 0000000000..bb7033fdea
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrJIT.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrJIT_h
+#define YarrJIT_h
+
+#if ENABLE(YARR_JIT)
+
+#include "JSGlobalData.h"
+#include "MacroAssemblerCodeRef.h"
+#include "MatchResult.h"
+#include "Yarr.h"
+#include "YarrPattern.h"
+
+#if CPU(X86) && !COMPILER(MSVC)
+#define YARR_CALL __attribute__ ((regparm (3)))
+#else
+#define YARR_CALL
+#endif
+
+namespace JSC {
+
+class JSGlobalData;
+class ExecutablePool;
+
+namespace Yarr {
+
+class YarrCodeBlock {
+#if CPU(X86_64)
+ typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+ typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#else
+ typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL;
+ typedef EncodedMatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL;
+#endif
+
+public:
+ YarrCodeBlock()
+ : m_needFallBack(false)
+ {
+ }
+
+ ~YarrCodeBlock()
+ {
+ }
+
+ void setFallBack(bool fallback) { m_needFallBack = fallback; }
+ bool isFallBack() { return m_needFallBack; }
+
+ bool has8BitCode() { return m_ref8.size(); }
+ bool has16BitCode() { return m_ref16.size(); }
+ void set8BitCode(MacroAssemblerCodeRef ref) { m_ref8 = ref; }
+ void set16BitCode(MacroAssemblerCodeRef ref) { m_ref16 = ref; }
+
+ bool has8BitCodeMatchOnly() { return m_matchOnly8.size(); }
+ bool has16BitCodeMatchOnly() { return m_matchOnly16.size(); }
+ void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; }
+ void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; }
+
+ MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output)
+ {
+ ASSERT(has8BitCode());
+ return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output));
+ }
+
+ MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output)
+ {
+ ASSERT(has16BitCode());
+ return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output));
+ }
+
+ MatchResult execute(const LChar* input, unsigned start, unsigned length)
+ {
+ ASSERT(has8BitCodeMatchOnly());
+ return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length));
+ }
+
+ MatchResult execute(const UChar* input, unsigned start, unsigned length)
+ {
+ ASSERT(has16BitCodeMatchOnly());
+ return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length));
+ }
+
+#if ENABLE(REGEXP_TRACING)
+ void *getAddr() { return m_ref.code().executableAddress(); }
+#endif
+
+ void clear()
+ {
+ m_ref8 = MacroAssemblerCodeRef();
+ m_ref16 = MacroAssemblerCodeRef();
+ m_matchOnly8 = MacroAssemblerCodeRef();
+ m_matchOnly16 = MacroAssemblerCodeRef();
+ m_needFallBack = false;
+ }
+
+private:
+ MacroAssemblerCodeRef m_ref8;
+ MacroAssemblerCodeRef m_ref16;
+ MacroAssemblerCodeRef m_matchOnly8;
+ MacroAssemblerCodeRef m_matchOnly16;
+ bool m_needFallBack;
+};
+
+enum YarrJITCompileMode {
+ MatchOnly,
+ IncludeSubpatterns
+};
+void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns);
+
+} } // namespace JSC::Yarr
+
+#endif
+
+#endif // YarrJIT_h
diff --git a/src/3rdparty/masm/yarr/YarrParser.h b/src/3rdparty/masm/yarr/YarrParser.h
new file mode 100644
index 0000000000..4bab1a0903
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrParser.h
@@ -0,0 +1,880 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrParser_h
+#define YarrParser_h
+
+#include "Yarr.h"
+#include <wtf/ASCIICType.h>
+#include <wtf/text/WTFString.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
+
+enum BuiltInCharacterClassID {
+ DigitClassID,
+ SpaceClassID,
+ WordClassID,
+ NewlineClassID,
+};
+
+// The Parser class should not be used directly - only via the Yarr::parse() method.
+template<class Delegate, typename CharType>
+class Parser {
+private:
+ template<class FriendDelegate>
+ friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
+
+ enum ErrorCode {
+ NoError,
+ PatternTooLarge,
+ QuantifierOutOfOrder,
+ QuantifierWithoutAtom,
+ QuantifierTooLarge,
+ MissingParentheses,
+ ParenthesesUnmatched,
+ ParenthesesTypeInvalid,
+ CharacterClassUnmatched,
+ CharacterClassOutOfOrder,
+ EscapeUnterminated,
+ NumberOfErrorCodes
+ };
+
+ /*
+ * CharacterClassParserDelegate:
+ *
+ * The class CharacterClassParserDelegate is used in the parsing of character
+ * classes. This class handles detection of character ranges. This class
+ * implements enough of the delegate interface such that it can be passed to
+ * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
+ * to perform the parsing of escape characters in character sets.
+ */
+ class CharacterClassParserDelegate {
+ public:
+ CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
+ : m_delegate(delegate)
+ , m_err(err)
+ , m_state(Empty)
+ , m_character(0)
+ {
+ }
+
+ /*
+ * begin():
+ *
+ * Called at beginning of construction.
+ */
+ void begin(bool invert)
+ {
+ m_delegate.atomCharacterClassBegin(invert);
+ }
+
+ /*
+ * atomPatternCharacter():
+ *
+ * This method is called either from parseCharacterClass() (for an unescaped
+ * character in a character class), or from parseEscape(). In the former case
+ * the value true will be passed for the argument 'hyphenIsRange', and in this
+ * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
+ * is different to /[a\-z]/).
+ */
+ void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+ {
+ switch (m_state) {
+ case AfterCharacterClass:
+ // Following a builtin character class we need look out for a hyphen.
+ // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+ // If we see a hyphen following a charater class then unlike usual
+ // we'll report it to the delegate immediately, and put ourself into
+ // a poisoned state. Any following calls to add another character or
+ // character class will result in an error. (A hypen following a
+ // character-class is itself valid, but only at the end of a regex).
+ if (hyphenIsRange && ch == '-') {
+ m_delegate.atomCharacterClassAtom('-');
+ m_state = AfterCharacterClassHyphen;
+ return;
+ }
+ // Otherwise just fall through - cached character so treat this as Empty.
+
+ case Empty:
+ m_character = ch;
+ m_state = CachedCharacter;
+ return;
+
+ case CachedCharacter:
+ if (hyphenIsRange && ch == '-')
+ m_state = CachedCharacterHyphen;
+ else {
+ m_delegate.atomCharacterClassAtom(m_character);
+ m_character = ch;
+ }
+ return;
+
+ case CachedCharacterHyphen:
+ if (ch < m_character) {
+ m_err = CharacterClassOutOfOrder;
+ return;
+ }
+ m_delegate.atomCharacterClassRange(m_character, ch);
+ m_state = Empty;
+ return;
+
+ // See coment in atomBuiltInCharacterClass below.
+ // This too is technically an error, per ECMA-262, and again we
+ // we chose to allow this. Note a subtlely here that while we
+ // diverge from the spec's definition of CharacterRange we do
+ // remain in compliance with the grammar. For example, consider
+ // the expression /[\d-a-z]/. We comply with the grammar in
+ // this case by not allowing a-z to be matched as a range.
+ case AfterCharacterClassHyphen:
+ m_delegate.atomCharacterClassAtom(ch);
+ m_state = Empty;
+ return;
+ }
+ }
+
+ /*
+ * atomBuiltInCharacterClass():
+ *
+ * Adds a built-in character class, called by parseEscape().
+ */
+ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+ {
+ switch (m_state) {
+ case CachedCharacter:
+ // Flush the currently cached character, then fall through.
+ m_delegate.atomCharacterClassAtom(m_character);
+
+ case Empty:
+ case AfterCharacterClass:
+ m_state = AfterCharacterClass;
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ return;
+
+ // If we hit either of these cases, we have an invalid range that
+ // looks something like /[x-\d]/ or /[\d-\d]/.
+ // According to ECMA-262 this should be a syntax error, but
+ // empirical testing shows this to break teh webz. Instead we
+ // comply with to the ECMA-262 grammar, and assume the grammar to
+ // have matched the range correctly, but tweak our interpretation
+ // of CharacterRange. Effectively we implicitly handle the hyphen
+ // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
+ case CachedCharacterHyphen:
+ m_delegate.atomCharacterClassAtom(m_character);
+ m_delegate.atomCharacterClassAtom('-');
+ // fall through
+ case AfterCharacterClassHyphen:
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
+ m_state = Empty;
+ return;
+ }
+ }
+
+ /*
+ * end():
+ *
+ * Called at end of construction.
+ */
+ void end()
+ {
+ if (m_state == CachedCharacter)
+ m_delegate.atomCharacterClassAtom(m_character);
+ else if (m_state == CachedCharacterHyphen) {
+ m_delegate.atomCharacterClassAtom(m_character);
+ m_delegate.atomCharacterClassAtom('-');
+ }
+ m_delegate.atomCharacterClassEnd();
+ }
+
+ // parseEscape() should never call these delegate methods when
+ // invoked with inCharacterClass set.
+ NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
+ NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
+
+ private:
+ Delegate& m_delegate;
+ ErrorCode& m_err;
+ enum CharacterClassConstructionState {
+ Empty,
+ CachedCharacter,
+ CachedCharacterHyphen,
+ AfterCharacterClass,
+ AfterCharacterClassHyphen,
+ } m_state;
+ UChar m_character;
+ };
+
+ Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
+ : m_delegate(delegate)
+ , m_backReferenceLimit(backReferenceLimit)
+ , m_err(NoError)
+ , m_data(pattern.getCharacters<CharType>())
+ , m_size(pattern.length())
+ , m_index(0)
+ , m_parenthesesNestingDepth(0)
+ {
+ }
+
+ /*
+ * parseEscape():
+ *
+ * Helper for parseTokens() AND parseCharacterClass().
+ * Unlike the other parser methods, this function does not report tokens
+ * directly to the member delegate (m_delegate), instead tokens are
+ * emitted to the delegate provided as an argument. In the case of atom
+ * escapes, parseTokens() will call parseEscape() passing m_delegate as
+ * an argument, and as such the escape will be reported to the delegate.
+ *
+ * However this method may also be used by parseCharacterClass(), in which
+ * case a CharacterClassParserDelegate will be passed as the delegate that
+ * tokens should be added to. A boolean flag is also provided to indicate
+ * whether that an escape in a CharacterClass is being parsed (some parsing
+ * rules change in this context).
+ *
+ * The boolean value returned by this method indicates whether the token
+ * parsed was an atom (outside of a characted class \b and \B will be
+ * interpreted as assertions).
+ */
+ template<bool inCharacterClass, class EscapeDelegate>
+ bool parseEscape(EscapeDelegate& delegate)
+ {
+ ASSERT(!m_err);
+ ASSERT(peek() == '\\');
+ consume();
+
+ if (atEndOfPattern()) {
+ m_err = EscapeUnterminated;
+ return false;
+ }
+
+ switch (peek()) {
+ // Assertions
+ case 'b':
+ consume();
+ if (inCharacterClass)
+ delegate.atomPatternCharacter('\b');
+ else {
+ delegate.assertionWordBoundary(false);
+ return false;
+ }
+ break;
+ case 'B':
+ consume();
+ if (inCharacterClass)
+ delegate.atomPatternCharacter('B');
+ else {
+ delegate.assertionWordBoundary(true);
+ return false;
+ }
+ break;
+
+ // CharacterClassEscape
+ case 'd':
+ consume();
+ delegate.atomBuiltInCharacterClass(DigitClassID, false);
+ break;
+ case 's':
+ consume();
+ delegate.atomBuiltInCharacterClass(SpaceClassID, false);
+ break;
+ case 'w':
+ consume();
+ delegate.atomBuiltInCharacterClass(WordClassID, false);
+ break;
+ case 'D':
+ consume();
+ delegate.atomBuiltInCharacterClass(DigitClassID, true);
+ break;
+ case 'S':
+ consume();
+ delegate.atomBuiltInCharacterClass(SpaceClassID, true);
+ break;
+ case 'W':
+ consume();
+ delegate.atomBuiltInCharacterClass(WordClassID, true);
+ break;
+
+ // DecimalEscape
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
+ // First, try to parse this as backreference.
+ if (!inCharacterClass) {
+ ParseState state = saveState();
+
+ unsigned backReference = consumeNumber();
+ if (backReference <= m_backReferenceLimit) {
+ delegate.atomBackReference(backReference);
+ break;
+ }
+
+ restoreState(state);
+ }
+
+ // Not a backreference, and not octal.
+ if (peek() >= '8') {
+ delegate.atomPatternCharacter('\\');
+ break;
+ }
+
+ // Fall-through to handle this as an octal escape.
+ }
+
+ // Octal escape
+ case '0':
+ delegate.atomPatternCharacter(consumeOctal());
+ break;
+
+ // ControlEscape
+ case 'f':
+ consume();
+ delegate.atomPatternCharacter('\f');
+ break;
+ case 'n':
+ consume();
+ delegate.atomPatternCharacter('\n');
+ break;
+ case 'r':
+ consume();
+ delegate.atomPatternCharacter('\r');
+ break;
+ case 't':
+ consume();
+ delegate.atomPatternCharacter('\t');
+ break;
+ case 'v':
+ consume();
+ delegate.atomPatternCharacter('\v');
+ break;
+
+ // ControlLetter
+ case 'c': {
+ ParseState state = saveState();
+ consume();
+ if (!atEndOfPattern()) {
+ int control = consume();
+
+ // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
+ if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
+ delegate.atomPatternCharacter(control & 0x1f);
+ break;
+ }
+ }
+ restoreState(state);
+ delegate.atomPatternCharacter('\\');
+ break;
+ }
+
+ // HexEscape
+ case 'x': {
+ consume();
+ int x = tryConsumeHex(2);
+ if (x == -1)
+ delegate.atomPatternCharacter('x');
+ else
+ delegate.atomPatternCharacter(x);
+ break;
+ }
+
+ // UnicodeEscape
+ case 'u': {
+ consume();
+ int u = tryConsumeHex(4);
+ if (u == -1)
+ delegate.atomPatternCharacter('u');
+ else
+ delegate.atomPatternCharacter(u);
+ break;
+ }
+
+ // IdentityEscape
+ default:
+ delegate.atomPatternCharacter(consume());
+ }
+
+ return true;
+ }
+
+ /*
+ * parseAtomEscape(), parseCharacterClassEscape():
+ *
+ * These methods alias to parseEscape().
+ */
+ bool parseAtomEscape()
+ {
+ return parseEscape<false>(m_delegate);
+ }
+ void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
+ {
+ parseEscape<true>(delegate);
+ }
+
+ /*
+ * parseCharacterClass():
+ *
+ * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
+ * to an instance of CharacterClassParserDelegate, to describe the character class to the
+ * delegate.
+ */
+ void parseCharacterClass()
+ {
+ ASSERT(!m_err);
+ ASSERT(peek() == '[');
+ consume();
+
+ CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
+
+ characterClassConstructor.begin(tryConsume('^'));
+
+ while (!atEndOfPattern()) {
+ switch (peek()) {
+ case ']':
+ consume();
+ characterClassConstructor.end();
+ return;
+
+ case '\\':
+ parseCharacterClassEscape(characterClassConstructor);
+ break;
+
+ default:
+ characterClassConstructor.atomPatternCharacter(consume(), true);
+ }
+
+ if (m_err)
+ return;
+ }
+
+ m_err = CharacterClassUnmatched;
+ }
+
+ /*
+ * parseParenthesesBegin():
+ *
+ * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
+ */
+ void parseParenthesesBegin()
+ {
+ ASSERT(!m_err);
+ ASSERT(peek() == '(');
+ consume();
+
+ if (tryConsume('?')) {
+ if (atEndOfPattern()) {
+ m_err = ParenthesesTypeInvalid;
+ return;
+ }
+
+ switch (consume()) {
+ case ':':
+ m_delegate.atomParenthesesSubpatternBegin(false);
+ break;
+
+ case '=':
+ m_delegate.atomParentheticalAssertionBegin();
+ break;
+
+ case '!':
+ m_delegate.atomParentheticalAssertionBegin(true);
+ break;
+
+ default:
+ m_err = ParenthesesTypeInvalid;
+ }
+ } else
+ m_delegate.atomParenthesesSubpatternBegin();
+
+ ++m_parenthesesNestingDepth;
+ }
+
+ /*
+ * parseParenthesesEnd():
+ *
+ * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
+ */
+ void parseParenthesesEnd()
+ {
+ ASSERT(!m_err);
+ ASSERT(peek() == ')');
+ consume();
+
+ if (m_parenthesesNestingDepth > 0)
+ m_delegate.atomParenthesesEnd();
+ else
+ m_err = ParenthesesUnmatched;
+
+ --m_parenthesesNestingDepth;
+ }
+
+ /*
+ * parseQuantifier():
+ *
+ * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
+ */
+ void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
+ {
+ ASSERT(!m_err);
+ ASSERT(min <= max);
+
+ if (min == UINT_MAX) {
+ m_err = QuantifierTooLarge;
+ return;
+ }
+
+ if (lastTokenWasAnAtom)
+ m_delegate.quantifyAtom(min, max, !tryConsume('?'));
+ else
+ m_err = QuantifierWithoutAtom;
+ }
+
+ /*
+ * parseTokens():
+ *
+ * This method loops over the input pattern reporting tokens to the delegate.
+ * The method returns when a parse error is detected, or the end of the pattern
+ * is reached. One piece of state is tracked around the loop, which is whether
+ * the last token passed to the delegate was an atom (this is necessary to detect
+ * a parse error when a quantifier provided without an atom to quantify).
+ */
+ void parseTokens()
+ {
+ bool lastTokenWasAnAtom = false;
+
+ while (!atEndOfPattern()) {
+ switch (peek()) {
+ case '|':
+ consume();
+ m_delegate.disjunction();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '(':
+ parseParenthesesBegin();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case ')':
+ parseParenthesesEnd();
+ lastTokenWasAnAtom = true;
+ break;
+
+ case '^':
+ consume();
+ m_delegate.assertionBOL();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '$':
+ consume();
+ m_delegate.assertionEOL();
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '.':
+ consume();
+ m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
+ lastTokenWasAnAtom = true;
+ break;
+
+ case '[':
+ parseCharacterClass();
+ lastTokenWasAnAtom = true;
+ break;
+
+ case '\\':
+ lastTokenWasAnAtom = parseAtomEscape();
+ break;
+
+ case '*':
+ consume();
+ parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '+':
+ consume();
+ parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '?':
+ consume();
+ parseQuantifier(lastTokenWasAnAtom, 0, 1);
+ lastTokenWasAnAtom = false;
+ break;
+
+ case '{': {
+ ParseState state = saveState();
+
+ consume();
+ if (peekIsDigit()) {
+ unsigned min = consumeNumber();
+ unsigned max = min;
+
+ if (tryConsume(','))
+ max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
+
+ if (tryConsume('}')) {
+ if (min <= max)
+ parseQuantifier(lastTokenWasAnAtom, min, max);
+ else
+ m_err = QuantifierOutOfOrder;
+ lastTokenWasAnAtom = false;
+ break;
+ }
+ }
+
+ restoreState(state);
+ } // if we did not find a complete quantifer, fall through to the default case.
+
+ default:
+ m_delegate.atomPatternCharacter(consume());
+ lastTokenWasAnAtom = true;
+ }
+
+ if (m_err)
+ return;
+ }
+
+ if (m_parenthesesNestingDepth > 0)
+ m_err = MissingParentheses;
+ }
+
+ /*
+ * parse():
+ *
+ * This method calls parseTokens() to parse over the input and converts any
+ * error code to a const char* for a result.
+ */
+ const char* parse()
+ {
+ if (m_size > MAX_PATTERN_SIZE)
+ m_err = PatternTooLarge;
+ else
+ parseTokens();
+ ASSERT(atEndOfPattern() || m_err);
+
+ // The order of this array must match the ErrorCode enum.
+ static const char* errorMessages[NumberOfErrorCodes] = {
+ 0, // NoError
+ REGEXP_ERROR_PREFIX "regular expression too large",
+ REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
+ REGEXP_ERROR_PREFIX "nothing to repeat",
+ REGEXP_ERROR_PREFIX "number too large in {} quantifier",
+ REGEXP_ERROR_PREFIX "missing )",
+ REGEXP_ERROR_PREFIX "unmatched parentheses",
+ REGEXP_ERROR_PREFIX "unrecognized character after (?",
+ REGEXP_ERROR_PREFIX "missing terminating ] for character class",
+ REGEXP_ERROR_PREFIX "range out of order in character class",
+ REGEXP_ERROR_PREFIX "\\ at end of pattern"
+ };
+
+ return errorMessages[m_err];
+ }
+
+ // Misc helper functions:
+
+ typedef unsigned ParseState;
+
+ ParseState saveState()
+ {
+ return m_index;
+ }
+
+ void restoreState(ParseState state)
+ {
+ m_index = state;
+ }
+
+ bool atEndOfPattern()
+ {
+ ASSERT(m_index <= m_size);
+ return m_index == m_size;
+ }
+
+ int peek()
+ {
+ ASSERT(m_index < m_size);
+ return m_data[m_index];
+ }
+
+ bool peekIsDigit()
+ {
+ return !atEndOfPattern() && WTF::isASCIIDigit(peek());
+ }
+
+ unsigned peekDigit()
+ {
+ ASSERT(peekIsDigit());
+ return peek() - '0';
+ }
+
+ int consume()
+ {
+ ASSERT(m_index < m_size);
+ return m_data[m_index++];
+ }
+
+ unsigned consumeDigit()
+ {
+ ASSERT(peekIsDigit());
+ return consume() - '0';
+ }
+
+ unsigned consumeNumber()
+ {
+ unsigned n = consumeDigit();
+ // check for overflow.
+ for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
+ n = newValue;
+ consume();
+ }
+ return n;
+ }
+
+ unsigned consumeOctal()
+ {
+ ASSERT(WTF::isASCIIOctalDigit(peek()));
+
+ unsigned n = consumeDigit();
+ while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
+ n = n * 8 + consumeDigit();
+ return n;
+ }
+
+ bool tryConsume(UChar ch)
+ {
+ if (atEndOfPattern() || (m_data[m_index] != ch))
+ return false;
+ ++m_index;
+ return true;
+ }
+
+ int tryConsumeHex(int count)
+ {
+ ParseState state = saveState();
+
+ int n = 0;
+ while (count--) {
+ if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
+ restoreState(state);
+ return -1;
+ }
+ n = (n << 4) | WTF::toASCIIHexValue(consume());
+ }
+ return n;
+ }
+
+ Delegate& m_delegate;
+ unsigned m_backReferenceLimit;
+ ErrorCode m_err;
+ const CharType* m_data;
+ unsigned m_size;
+ unsigned m_index;
+ unsigned m_parenthesesNestingDepth;
+
+ // Derived by empirical testing of compile time in PCRE and WREC.
+ static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
+};
+
+/*
+ * Yarr::parse():
+ *
+ * The parse method is passed a pattern to be parsed and a delegate upon which
+ * callbacks will be made to record the parsed tokens forming the regex.
+ * Yarr::parse() returns null on success, or a const C string providing an error
+ * message where a parse error occurs.
+ *
+ * The Delegate must implement the following interface:
+ *
+ * void assertionBOL();
+ * void assertionEOL();
+ * void assertionWordBoundary(bool invert);
+ *
+ * void atomPatternCharacter(UChar ch);
+ * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
+ * void atomCharacterClassBegin(bool invert)
+ * void atomCharacterClassAtom(UChar ch)
+ * void atomCharacterClassRange(UChar begin, UChar end)
+ * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+ * void atomCharacterClassEnd()
+ * void atomParenthesesSubpatternBegin(bool capture = true);
+ * void atomParentheticalAssertionBegin(bool invert = false);
+ * void atomParenthesesEnd();
+ * void atomBackReference(unsigned subpatternId);
+ *
+ * void quantifyAtom(unsigned min, unsigned max, bool greedy);
+ *
+ * void disjunction();
+ *
+ * The regular expression is described by a sequence of assertion*() and atom*()
+ * callbacks to the delegate, describing the terms in the regular expression.
+ * Following an atom a quantifyAtom() call may occur to indicate that the previous
+ * atom should be quantified. In the case of atoms described across multiple
+ * calls (parentheses and character classes) the call to quantifyAtom() will come
+ * after the call to the atom*End() method, never after atom*Begin().
+ *
+ * Character classes may either be described by a single call to
+ * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
+ * In the latter case, ...Begin() will be called, followed by a sequence of
+ * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
+ *
+ * Sequences of atoms and assertions are broken into alternatives via calls to
+ * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
+ * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
+ * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
+ * capturing subpattern, this will be the subpatternId associated with these
+ * parentheses, and will also by definition be the lowest subpatternId of these
+ * parentheses and of any nested paretheses. The atomParenthesesEnd() method
+ * is passed the subpatternId of the last capturing subexpression nested within
+ * these paretheses. In the case of a capturing subpattern with no nested
+ * capturing subpatterns, the same subpatternId will be passed to the begin and
+ * end functions. In the case of non-capturing subpatterns the subpatternId
+ * passed to the begin method is also the first possible subpatternId that might
+ * be nested within these paretheses. If a set of non-capturing parentheses does
+ * not contain any capturing subpatterns, then the subpatternId passed to begin
+ * will be greater than the subpatternId passed to end.
+ */
+
+template<class Delegate>
+const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
+{
+ if (pattern.is8Bit())
+ return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
+ return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
+}
+
+} } // namespace JSC::Yarr
+
+#endif // YarrParser_h
diff --git a/src/3rdparty/masm/yarr/YarrPattern.cpp b/src/3rdparty/masm/yarr/YarrPattern.cpp
new file mode 100644
index 0000000000..c953a38d2f
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrPattern.cpp
@@ -0,0 +1,874 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrPattern.h"
+
+#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
+#include "YarrParser.h"
+#include <wtf/Vector.h>
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+#include "RegExpJitTables.h"
+
+class CharacterClassConstructor {
+public:
+ CharacterClassConstructor(bool isCaseInsensitive = false)
+ : m_isCaseInsensitive(isCaseInsensitive)
+ {
+ }
+
+ void reset()
+ {
+ m_matches.clear();
+ m_ranges.clear();
+ m_matchesUnicode.clear();
+ m_rangesUnicode.clear();
+ }
+
+ void append(const CharacterClass* other)
+ {
+ for (size_t i = 0; i < other->m_matches.size(); ++i)
+ addSorted(m_matches, other->m_matches[i]);
+ for (size_t i = 0; i < other->m_ranges.size(); ++i)
+ addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
+ for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
+ addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
+ for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
+ addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
+ }
+
+ void putChar(UChar ch)
+ {
+ // Handle ascii cases.
+ if (ch <= 0x7f) {
+ if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
+ addSorted(m_matches, toASCIIUpper(ch));
+ addSorted(m_matches, toASCIILower(ch));
+ } else
+ addSorted(m_matches, ch);
+ return;
+ }
+
+ // Simple case, not a case-insensitive match.
+ if (!m_isCaseInsensitive) {
+ addSorted(m_matchesUnicode, ch);
+ return;
+ }
+
+ // Add multiple matches, if necessary.
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique)
+ addSorted(m_matchesUnicode, ch);
+ else
+ putUnicodeIgnoreCase(ch, info);
+ }
+
+ void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
+ {
+ ASSERT(m_isCaseInsensitive);
+ ASSERT(ch > 0x7f);
+ ASSERT(ch >= info->begin && ch <= info->end);
+ ASSERT(info->type != CanonicalizeUnique);
+ if (info->type == CanonicalizeSet) {
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ } else {
+ addSorted(m_matchesUnicode, ch);
+ addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+ }
+ }
+
+ void putRange(UChar lo, UChar hi)
+ {
+ if (lo <= 0x7f) {
+ char asciiLo = lo;
+ char asciiHi = std::min(hi, (UChar)0x7f);
+ addSortedRange(m_ranges, lo, asciiHi);
+
+ if (m_isCaseInsensitive) {
+ if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
+ addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
+ if ((asciiLo <= 'z') && (asciiHi >= 'a'))
+ addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
+ }
+ }
+ if (hi <= 0x7f)
+ return;
+
+ lo = std::max(lo, (UChar)0x80);
+ addSortedRange(m_rangesUnicode, lo, hi);
+
+ if (!m_isCaseInsensitive)
+ return;
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ while (true) {
+ // Handle the range [lo .. end]
+ UChar end = std::min<UChar>(info->end, hi);
+
+ switch (info->type) {
+ case CanonicalizeUnique:
+ // Nothing to do - no canonical equivalents.
+ break;
+ case CanonicalizeSet: {
+ UChar ch;
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ break;
+ }
+ case CanonicalizeRangeLo:
+ addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
+ break;
+ case CanonicalizeRangeHi:
+ addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
+ break;
+ case CanonicalizeAlternatingAligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (lo & 1)
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (!(end & 1))
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ case CanonicalizeAlternatingUnaligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (!(lo & 1))
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (end & 1)
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ }
+
+ if (hi == end)
+ return;
+
+ ++info;
+ lo = info->begin;
+ };
+
+ }
+
+ CharacterClass* charClass()
+ {
+ CharacterClass* characterClass = new CharacterClass(0);
+
+ characterClass->m_matches.swap(m_matches);
+ characterClass->m_ranges.swap(m_ranges);
+ characterClass->m_matchesUnicode.swap(m_matchesUnicode);
+ characterClass->m_rangesUnicode.swap(m_rangesUnicode);
+
+ return characterClass;
+ }
+
+private:
+ void addSorted(Vector<UChar>& matches, UChar ch)
+ {
+ unsigned pos = 0;
+ unsigned range = matches.size();
+
+ // binary chop, find position to insert char.
+ while (range) {
+ unsigned index = range >> 1;
+
+ int val = matches[pos+index] - ch;
+ if (!val)
+ return;
+ else if (val > 0)
+ range = index;
+ else {
+ pos += (index+1);
+ range -= (index+1);
+ }
+ }
+
+ if (pos == matches.size())
+ matches.append(ch);
+ else
+ matches.insert(pos, ch);
+ }
+
+ void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
+ {
+ unsigned end = ranges.size();
+
+ // Simple linear scan - I doubt there are that many ranges anyway...
+ // feel free to fix this with something faster (eg binary chop).
+ for (unsigned i = 0; i < end; ++i) {
+ // does the new range fall before the current position in the array
+ if (hi < ranges[i].begin) {
+ // optional optimization: concatenate appending ranges? - may not be worthwhile.
+ if (hi == (ranges[i].begin - 1)) {
+ ranges[i].begin = lo;
+ return;
+ }
+ ranges.insert(i, CharacterRange(lo, hi));
+ return;
+ }
+ // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
+ // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
+ // end of the last range they concatenate, which is just as good.
+ if (lo <= (ranges[i].end + 1)) {
+ // found an intersect! we'll replace this entry in the array.
+ ranges[i].begin = std::min(ranges[i].begin, lo);
+ ranges[i].end = std::max(ranges[i].end, hi);
+
+ // now check if the new range can subsume any subsequent ranges.
+ unsigned next = i+1;
+ // each iteration of the loop we will either remove something from the list, or break the loop.
+ while (next < ranges.size()) {
+ if (ranges[next].begin <= (ranges[i].end + 1)) {
+ // the next entry now overlaps / concatenates this one.
+ ranges[i].end = std::max(ranges[i].end, ranges[next].end);
+ ranges.remove(next);
+ } else
+ break;
+ }
+
+ return;
+ }
+ }
+
+ // CharacterRange comes after all existing ranges.
+ ranges.append(CharacterRange(lo, hi));
+ }
+
+ bool m_isCaseInsensitive;
+
+ Vector<UChar> m_matches;
+ Vector<CharacterRange> m_ranges;
+ Vector<UChar> m_matchesUnicode;
+ Vector<CharacterRange> m_rangesUnicode;
+};
+
+class YarrPatternConstructor {
+public:
+ YarrPatternConstructor(YarrPattern& pattern)
+ : m_pattern(pattern)
+ , m_characterClassConstructor(pattern.m_ignoreCase)
+ , m_invertParentheticalAssertion(false)
+ {
+ m_pattern.m_body = new PatternDisjunction();
+ m_alternative = m_pattern.m_body->addNewAlternative();
+ m_pattern.m_disjunctions.append(m_pattern.m_body);
+ }
+
+ ~YarrPatternConstructor()
+ {
+ }
+
+ void reset()
+ {
+ m_pattern.reset();
+ m_characterClassConstructor.reset();
+
+ m_pattern.m_body = new PatternDisjunction();
+ m_alternative = m_pattern.m_body->addNewAlternative();
+ m_pattern.m_disjunctions.append(m_pattern.m_body);
+ }
+
+ void assertionBOL()
+ {
+ if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
+ m_alternative->m_startsWithBOL = true;
+ m_alternative->m_containsBOL = true;
+ m_pattern.m_containsBOL = true;
+ }
+ m_alternative->m_terms.append(PatternTerm::BOL());
+ }
+ void assertionEOL()
+ {
+ m_alternative->m_terms.append(PatternTerm::EOL());
+ }
+ void assertionWordBoundary(bool invert)
+ {
+ m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
+ }
+
+ void atomPatternCharacter(UChar ch)
+ {
+ // We handle case-insensitive checking of unicode characters which do have both
+ // cases by handling them as if they were defined using a CharacterClass.
+ if (!m_pattern.m_ignoreCase || isASCII(ch)) {
+ m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique) {
+ m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
+ CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+ m_pattern.m_userCharacterClasses.append(newCharacterClass);
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
+ }
+
+ void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+ {
+ switch (classID) {
+ case DigitClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
+ break;
+ case SpaceClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
+ break;
+ case WordClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+ break;
+ case NewlineClassID:
+ m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
+ break;
+ }
+ }
+
+ void atomCharacterClassBegin(bool invert = false)
+ {
+ m_invertCharacterClass = invert;
+ }
+
+ void atomCharacterClassAtom(UChar ch)
+ {
+ m_characterClassConstructor.putChar(ch);
+ }
+
+ void atomCharacterClassRange(UChar begin, UChar end)
+ {
+ m_characterClassConstructor.putRange(begin, end);
+ }
+
+ void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+ {
+ ASSERT(classID != NewlineClassID);
+
+ switch (classID) {
+ case DigitClassID:
+ m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
+ break;
+
+ case SpaceClassID:
+ m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
+ break;
+
+ case WordClassID:
+ m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
+ break;
+
+ default:
+ ASSERT_NOT_REACHED();
+ }
+ }
+
+ void atomCharacterClassEnd()
+ {
+ CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
+ m_pattern.m_userCharacterClasses.append(newCharacterClass);
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
+ }
+
+ void atomParenthesesSubpatternBegin(bool capture = true)
+ {
+ unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
+ if (capture)
+ m_pattern.m_numSubpatterns++;
+
+ PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
+ m_pattern.m_disjunctions.append(parenthesesDisjunction);
+ m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
+ m_alternative = parenthesesDisjunction->addNewAlternative();
+ }
+
+ void atomParentheticalAssertionBegin(bool invert = false)
+ {
+ PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
+ m_pattern.m_disjunctions.append(parenthesesDisjunction);
+ m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
+ m_alternative = parenthesesDisjunction->addNewAlternative();
+ m_invertParentheticalAssertion = invert;
+ }
+
+ void atomParenthesesEnd()
+ {
+ ASSERT(m_alternative->m_parent);
+ ASSERT(m_alternative->m_parent->m_parent);
+
+ PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
+ m_alternative = m_alternative->m_parent->m_parent;
+
+ PatternTerm& lastTerm = m_alternative->lastTerm();
+
+ unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
+ unsigned numBOLAnchoredAlts = 0;
+
+ for (unsigned i = 0; i < numParenAlternatives; i++) {
+ // Bubble up BOL flags
+ if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
+ numBOLAnchoredAlts++;
+ }
+
+ if (numBOLAnchoredAlts) {
+ m_alternative->m_containsBOL = true;
+ // If all the alternatives in parens start with BOL, then so does this one
+ if (numBOLAnchoredAlts == numParenAlternatives)
+ m_alternative->m_startsWithBOL = true;
+ }
+
+ lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
+ m_invertParentheticalAssertion = false;
+ }
+
+ void atomBackReference(unsigned subpatternId)
+ {
+ ASSERT(subpatternId);
+ m_pattern.m_containsBackreferences = true;
+ m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
+
+ if (subpatternId > m_pattern.m_numSubpatterns) {
+ m_alternative->m_terms.append(PatternTerm::ForwardReference());
+ return;
+ }
+
+ PatternAlternative* currentAlternative = m_alternative;
+ ASSERT(currentAlternative);
+
+ // Note to self: if we waited until the AST was baked, we could also remove forwards refs
+ while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
+ PatternTerm& term = currentAlternative->lastTerm();
+ ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
+
+ if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
+ m_alternative->m_terms.append(PatternTerm::ForwardReference());
+ return;
+ }
+ }
+
+ m_alternative->m_terms.append(PatternTerm(subpatternId));
+ }
+
+ // deep copy the argument disjunction. If filterStartsWithBOL is true,
+ // skip alternatives with m_startsWithBOL set true.
+ PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
+ {
+ PatternDisjunction* newDisjunction = 0;
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+ if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
+ if (!newDisjunction) {
+ newDisjunction = new PatternDisjunction();
+ newDisjunction->m_parent = disjunction->m_parent;
+ }
+ PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
+ newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
+ }
+ }
+
+ if (newDisjunction)
+ m_pattern.m_disjunctions.append(newDisjunction);
+ return newDisjunction;
+ }
+
+ PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
+ {
+ if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
+ return PatternTerm(term);
+
+ PatternTerm termCopy = term;
+ termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
+ return termCopy;
+ }
+
+ void quantifyAtom(unsigned min, unsigned max, bool greedy)
+ {
+ ASSERT(min <= max);
+ ASSERT(m_alternative->m_terms.size());
+
+ if (!max) {
+ m_alternative->removeLastTerm();
+ return;
+ }
+
+ PatternTerm& term = m_alternative->lastTerm();
+ ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
+ ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
+
+ if (term.type == PatternTerm::TypeParentheticalAssertion) {
+ // If an assertion is quantified with a minimum count of zero, it can simply be removed.
+ // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
+ // results in any input being consumed, however the continuation passed to the assertion
+ // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
+ // reject all zero length matches (see step 2.1). A match from the continuation of the
+ // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
+ // this is that matches from the assertion are not required, and won't be accepted anyway,
+ // so no need to ever run it.
+ if (!min)
+ m_alternative->removeLastTerm();
+ // We never need to run an assertion more than once. Subsequent interations will be run
+ // with the same start index (since assertions are non-capturing) and the same captures
+ // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
+ // same result and captures. If the first match succeeds then the subsequent (min - 1)
+ // matches will too. Any additional optional matches will fail (on the same basis as the
+ // minimum zero quantified assertions, above), but this will still result in a match.
+ return;
+ }
+
+ if (min == 0)
+ term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+ else if (min == max)
+ term.quantify(min, QuantifierFixedCount);
+ else {
+ term.quantify(min, QuantifierFixedCount);
+ m_alternative->m_terms.append(copyTerm(term));
+ // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
+ m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
+ if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
+ m_alternative->lastTerm().parentheses.isCopy = true;
+ }
+ }
+
+ void disjunction()
+ {
+ m_alternative = m_alternative->m_parent->addNewAlternative();
+ }
+
+ unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
+ {
+ alternative->m_hasFixedSize = true;
+ Checked<unsigned> currentInputPosition = initialInputPosition;
+
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+ PatternTerm& term = alternative->m_terms[i];
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ case PatternTerm::TypeAssertionEOL:
+ case PatternTerm::TypeAssertionWordBoundary:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ break;
+
+ case PatternTerm::TypeBackReference:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
+ alternative->m_hasFixedSize = false;
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ case PatternTerm::TypePatternCharacter:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ if (term.quantityType != QuantifierFixedCount) {
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
+ alternative->m_hasFixedSize = false;
+ } else
+ currentInputPosition += term.quantityCount;
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ if (term.quantityType != QuantifierFixedCount) {
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
+ alternative->m_hasFixedSize = false;
+ } else
+ currentInputPosition += term.quantityCount;
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern:
+ // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
+ term.frameLocation = currentCallFrameSize;
+ if (term.quantityCount == 1 && !term.parentheses.isCopy) {
+ if (term.quantityType != QuantifierFixedCount)
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ // If quantity is fixed, then pre-check its minimum size.
+ if (term.quantityType == QuantifierFixedCount)
+ currentInputPosition += term.parentheses.disjunction->m_minimumSize;
+ term.inputPosition = currentInputPosition.unsafeGet();
+ } else if (term.parentheses.isTerminal) {
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ term.inputPosition = currentInputPosition.unsafeGet();
+ } else {
+ term.inputPosition = currentInputPosition.unsafeGet();
+ setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
+ currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
+ }
+ // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
+ alternative->m_hasFixedSize = false;
+ break;
+
+ case PatternTerm::TypeParentheticalAssertion:
+ term.inputPosition = currentInputPosition.unsafeGet();
+ term.frameLocation = currentCallFrameSize;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
+ break;
+
+ case PatternTerm::TypeDotStarEnclosure:
+ alternative->m_hasFixedSize = false;
+ term.inputPosition = initialInputPosition;
+ break;
+ }
+ }
+
+ alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
+ return currentCallFrameSize;
+ }
+
+ unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
+ {
+ if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
+ initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
+
+ unsigned minimumInputSize = UINT_MAX;
+ unsigned maximumCallFrameSize = 0;
+ bool hasFixedSize = true;
+
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+ unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
+ minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
+ maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
+ hasFixedSize &= alternative->m_hasFixedSize;
+ }
+
+ ASSERT(minimumInputSize != UINT_MAX);
+ ASSERT(maximumCallFrameSize >= initialCallFrameSize);
+
+ disjunction->m_hasFixedSize = hasFixedSize;
+ disjunction->m_minimumSize = minimumInputSize;
+ disjunction->m_callFrameSize = maximumCallFrameSize;
+ return maximumCallFrameSize;
+ }
+
+ void setupOffsets()
+ {
+ setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
+ }
+
+ // This optimization identifies sets of parentheses that we will never need to backtrack.
+ // In these cases we do not need to store state from prior iterations.
+ // We can presently avoid backtracking for:
+ // * where the parens are at the end of the regular expression (last term in any of the
+ // alternatives of the main body disjunction).
+ // * where the parens are non-capturing, and quantified unbounded greedy (*).
+ // * where the parens do not contain any capturing subpatterns.
+ void checkForTerminalParentheses()
+ {
+ // This check is much too crude; should be just checking whether the candidate
+ // node contains nested capturing subpatterns, not the whole expression!
+ if (m_pattern.m_numSubpatterns)
+ return;
+
+ Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+ for (size_t i = 0; i < alternatives.size(); ++i) {
+ Vector<PatternTerm>& terms = alternatives[i]->m_terms;
+ if (terms.size()) {
+ PatternTerm& term = terms.last();
+ if (term.type == PatternTerm::TypeParenthesesSubpattern
+ && term.quantityType == QuantifierGreedy
+ && term.quantityCount == quantifyInfinite
+ && !term.capture())
+ term.parentheses.isTerminal = true;
+ }
+ }
+ }
+
+ void optimizeBOL()
+ {
+ // Look for expressions containing beginning of line (^) anchoring and unroll them.
+ // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
+ // This code relies on the parsing code tagging alternatives with m_containsBOL and
+ // m_startsWithBOL and rolling those up to containing alternatives.
+ // At this point, this is only valid for non-multiline expressions.
+ PatternDisjunction* disjunction = m_pattern.m_body;
+
+ if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
+ return;
+
+ PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
+
+ // Set alternatives in disjunction to "onceThrough"
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
+ disjunction->m_alternatives[alt]->setOnceThrough();
+
+ if (loopDisjunction) {
+ // Move alternatives from loopDisjunction to disjunction
+ for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
+ disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
+
+ loopDisjunction->m_alternatives.clear();
+ }
+ }
+
+ bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
+ {
+ Vector<PatternTerm>& terms = alternative->m_terms;
+
+ for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
+ PatternTerm& term = terms[termIndex];
+
+ if (term.m_capture)
+ return true;
+
+ if (term.type == PatternTerm::TypeParenthesesSubpattern) {
+ PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
+ for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
+ if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ // This optimization identifies alternatives in the form of
+ // [^].*[?]<expression>.*[$] for expressions that don't have any
+ // capturing terms. The alternative is changed to <expression>
+ // followed by processing of the dot stars to find and adjust the
+ // beginning and the end of the match.
+ void optimizeDotStarWrappedExpressions()
+ {
+ Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+ if (alternatives.size() != 1)
+ return;
+
+ PatternAlternative* alternative = alternatives[0];
+ Vector<PatternTerm>& terms = alternative->m_terms;
+ if (terms.size() >= 3) {
+ bool startsWithBOL = false;
+ bool endsWithEOL = false;
+ size_t termIndex, firstExpressionTerm, lastExpressionTerm;
+
+ termIndex = 0;
+ if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
+ startsWithBOL = true;
+ ++termIndex;
+ }
+
+ PatternTerm& firstNonAnchorTerm = terms[termIndex];
+ if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
+ return;
+
+ firstExpressionTerm = termIndex + 1;
+
+ termIndex = terms.size() - 1;
+ if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
+ endsWithEOL = true;
+ --termIndex;
+ }
+
+ PatternTerm& lastNonAnchorTerm = terms[termIndex];
+ if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
+ return;
+
+ lastExpressionTerm = termIndex - 1;
+
+ if (firstExpressionTerm > lastExpressionTerm)
+ return;
+
+ if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
+ for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
+ terms.remove(termIndex);
+
+ for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
+ terms.remove(termIndex - 1);
+
+ terms.append(PatternTerm(startsWithBOL, endsWithEOL));
+
+ m_pattern.m_containsBOL = false;
+ }
+ }
+ }
+
+private:
+ YarrPattern& m_pattern;
+ PatternAlternative* m_alternative;
+ CharacterClassConstructor m_characterClassConstructor;
+ bool m_invertCharacterClass;
+ bool m_invertParentheticalAssertion;
+};
+
+const char* YarrPattern::compile(const String& patternString)
+{
+ YarrPatternConstructor constructor(*this);
+
+ if (const char* error = parse(constructor, patternString))
+ return error;
+
+ // If the pattern contains illegal backreferences reset & reparse.
+ // Quoting Netscape's "What's new in JavaScript 1.2",
+ // "Note: if the number of left parentheses is less than the number specified
+ // in \#, the \# is taken as an octal escape as described in the next row."
+ if (containsIllegalBackReference()) {
+ unsigned numSubpatterns = m_numSubpatterns;
+
+ constructor.reset();
+#if !ASSERT_DISABLED
+ const char* error =
+#endif
+ parse(constructor, patternString, numSubpatterns);
+
+ ASSERT(!error);
+ ASSERT(numSubpatterns == m_numSubpatterns);
+ }
+
+ constructor.checkForTerminalParentheses();
+ constructor.optimizeDotStarWrappedExpressions();
+ constructor.optimizeBOL();
+
+ constructor.setupOffsets();
+
+ return 0;
+}
+
+YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error)
+ : m_ignoreCase(ignoreCase)
+ , m_multiline(multiline)
+ , m_containsBackreferences(false)
+ , m_containsBOL(false)
+ , m_numSubpatterns(0)
+ , m_maxBackReference(0)
+ , newlineCached(0)
+ , digitsCached(0)
+ , spacesCached(0)
+ , wordcharCached(0)
+ , nondigitsCached(0)
+ , nonspacesCached(0)
+ , nonwordcharCached(0)
+{
+ *error = compile(pattern);
+}
+
+} }
diff --git a/src/3rdparty/masm/yarr/YarrPattern.h b/src/3rdparty/masm/yarr/YarrPattern.h
new file mode 100644
index 0000000000..14e89b8e09
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrPattern.h
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrPattern_h
+#define YarrPattern_h
+
+#include <wtf/CheckedArithmetic.h>
+#include <wtf/RefCounted.h>
+#include <wtf/Vector.h>
+#include <wtf/text/WTFString.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+struct PatternDisjunction;
+
+struct CharacterRange {
+ UChar begin;
+ UChar end;
+
+ CharacterRange(UChar begin, UChar end)
+ : begin(begin)
+ , end(end)
+ {
+ }
+};
+
+struct CharacterClassTable : RefCounted<CharacterClassTable> {
+ const char* m_table;
+ bool m_inverted;
+ static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
+ {
+ return adoptRef(new CharacterClassTable(table, inverted));
+ }
+
+private:
+ CharacterClassTable(const char* table, bool inverted)
+ : m_table(table)
+ , m_inverted(inverted)
+ {
+ }
+};
+
+struct CharacterClass {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ // All CharacterClass instances have to have the full set of matches and ranges,
+ // they may have an optional table for faster lookups (which must match the
+ // specified matches and ranges)
+ CharacterClass(PassRefPtr<CharacterClassTable> table)
+ : m_table(table)
+ {
+ }
+ Vector<UChar> m_matches;
+ Vector<CharacterRange> m_ranges;
+ Vector<UChar> m_matchesUnicode;
+ Vector<CharacterRange> m_rangesUnicode;
+ RefPtr<CharacterClassTable> m_table;
+};
+
+enum QuantifierType {
+ QuantifierFixedCount,
+ QuantifierGreedy,
+ QuantifierNonGreedy,
+};
+
+struct PatternTerm {
+ enum Type {
+ TypeAssertionBOL,
+ TypeAssertionEOL,
+ TypeAssertionWordBoundary,
+ TypePatternCharacter,
+ TypeCharacterClass,
+ TypeBackReference,
+ TypeForwardReference,
+ TypeParenthesesSubpattern,
+ TypeParentheticalAssertion,
+ TypeDotStarEnclosure,
+ } type;
+ bool m_capture :1;
+ bool m_invert :1;
+ union {
+ UChar patternCharacter;
+ CharacterClass* characterClass;
+ unsigned backReferenceSubpatternId;
+ struct {
+ PatternDisjunction* disjunction;
+ unsigned subpatternId;
+ unsigned lastSubpatternId;
+ bool isCopy;
+ bool isTerminal;
+ } parentheses;
+ struct {
+ bool bolAnchor : 1;
+ bool eolAnchor : 1;
+ } anchors;
+ };
+ QuantifierType quantityType;
+ Checked<unsigned> quantityCount;
+ int inputPosition;
+ unsigned frameLocation;
+
+ PatternTerm(UChar ch)
+ : type(PatternTerm::TypePatternCharacter)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ patternCharacter = ch;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ PatternTerm(CharacterClass* charClass, bool invert)
+ : type(PatternTerm::TypeCharacterClass)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ characterClass = charClass;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
+ : type(type)
+ , m_capture(capture)
+ , m_invert(invert)
+ {
+ parentheses.disjunction = disjunction;
+ parentheses.subpatternId = subpatternId;
+ parentheses.isCopy = false;
+ parentheses.isTerminal = false;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ PatternTerm(Type type, bool invert = false)
+ : type(type)
+ , m_capture(false)
+ , m_invert(invert)
+ {
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ PatternTerm(unsigned spatternId)
+ : type(TypeBackReference)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ backReferenceSubpatternId = spatternId;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ PatternTerm(bool bolAnchor, bool eolAnchor)
+ : type(TypeDotStarEnclosure)
+ , m_capture(false)
+ , m_invert(false)
+ {
+ anchors.bolAnchor = bolAnchor;
+ anchors.eolAnchor = eolAnchor;
+ quantityType = QuantifierFixedCount;
+ quantityCount = 1;
+ }
+
+ static PatternTerm ForwardReference()
+ {
+ return PatternTerm(TypeForwardReference);
+ }
+
+ static PatternTerm BOL()
+ {
+ return PatternTerm(TypeAssertionBOL);
+ }
+
+ static PatternTerm EOL()
+ {
+ return PatternTerm(TypeAssertionEOL);
+ }
+
+ static PatternTerm WordBoundary(bool invert)
+ {
+ return PatternTerm(TypeAssertionWordBoundary, invert);
+ }
+
+ bool invert()
+ {
+ return m_invert;
+ }
+
+ bool capture()
+ {
+ return m_capture;
+ }
+
+ void quantify(unsigned count, QuantifierType type)
+ {
+ quantityCount = count;
+ quantityType = type;
+ }
+};
+
+struct PatternAlternative {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ PatternAlternative(PatternDisjunction* disjunction)
+ : m_parent(disjunction)
+ , m_onceThrough(false)
+ , m_hasFixedSize(false)
+ , m_startsWithBOL(false)
+ , m_containsBOL(false)
+ {
+ }
+
+ PatternTerm& lastTerm()
+ {
+ ASSERT(m_terms.size());
+ return m_terms[m_terms.size() - 1];
+ }
+
+ void removeLastTerm()
+ {
+ ASSERT(m_terms.size());
+ m_terms.shrink(m_terms.size() - 1);
+ }
+
+ void setOnceThrough()
+ {
+ m_onceThrough = true;
+ }
+
+ bool onceThrough()
+ {
+ return m_onceThrough;
+ }
+
+ Vector<PatternTerm> m_terms;
+ PatternDisjunction* m_parent;
+ unsigned m_minimumSize;
+ bool m_onceThrough : 1;
+ bool m_hasFixedSize : 1;
+ bool m_startsWithBOL : 1;
+ bool m_containsBOL : 1;
+};
+
+struct PatternDisjunction {
+ WTF_MAKE_FAST_ALLOCATED;
+public:
+ PatternDisjunction(PatternAlternative* parent = 0)
+ : m_parent(parent)
+ , m_hasFixedSize(false)
+ {
+ }
+
+ ~PatternDisjunction()
+ {
+ deleteAllValues(m_alternatives);
+ }
+
+ PatternAlternative* addNewAlternative()
+ {
+ PatternAlternative* alternative = new PatternAlternative(this);
+ m_alternatives.append(alternative);
+ return alternative;
+ }
+
+ Vector<PatternAlternative*> m_alternatives;
+ PatternAlternative* m_parent;
+ unsigned m_minimumSize;
+ unsigned m_callFrameSize;
+ bool m_hasFixedSize;
+};
+
+// You probably don't want to be calling these functions directly
+// (please to be calling newlineCharacterClass() et al on your
+// friendly neighborhood YarrPattern instance to get nicely
+// cached copies).
+CharacterClass* newlineCreate();
+CharacterClass* digitsCreate();
+CharacterClass* spacesCreate();
+CharacterClass* wordcharCreate();
+CharacterClass* nondigitsCreate();
+CharacterClass* nonspacesCreate();
+CharacterClass* nonwordcharCreate();
+
+struct TermChain {
+ TermChain(PatternTerm term)
+ : term(term)
+ {}
+
+ PatternTerm term;
+ Vector<TermChain> hotTerms;
+};
+
+struct YarrPattern {
+ JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error);
+
+ ~YarrPattern()
+ {
+ deleteAllValues(m_disjunctions);
+ deleteAllValues(m_userCharacterClasses);
+ }
+
+ void reset()
+ {
+ m_numSubpatterns = 0;
+ m_maxBackReference = 0;
+
+ m_containsBackreferences = false;
+ m_containsBOL = false;
+
+ newlineCached = 0;
+ digitsCached = 0;
+ spacesCached = 0;
+ wordcharCached = 0;
+ nondigitsCached = 0;
+ nonspacesCached = 0;
+ nonwordcharCached = 0;
+
+ deleteAllValues(m_disjunctions);
+ m_disjunctions.clear();
+ deleteAllValues(m_userCharacterClasses);
+ m_userCharacterClasses.clear();
+ }
+
+ bool containsIllegalBackReference()
+ {
+ return m_maxBackReference > m_numSubpatterns;
+ }
+
+ CharacterClass* newlineCharacterClass()
+ {
+ if (!newlineCached)
+ m_userCharacterClasses.append(newlineCached = newlineCreate());
+ return newlineCached;
+ }
+ CharacterClass* digitsCharacterClass()
+ {
+ if (!digitsCached)
+ m_userCharacterClasses.append(digitsCached = digitsCreate());
+ return digitsCached;
+ }
+ CharacterClass* spacesCharacterClass()
+ {
+ if (!spacesCached)
+ m_userCharacterClasses.append(spacesCached = spacesCreate());
+ return spacesCached;
+ }
+ CharacterClass* wordcharCharacterClass()
+ {
+ if (!wordcharCached)
+ m_userCharacterClasses.append(wordcharCached = wordcharCreate());
+ return wordcharCached;
+ }
+ CharacterClass* nondigitsCharacterClass()
+ {
+ if (!nondigitsCached)
+ m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
+ return nondigitsCached;
+ }
+ CharacterClass* nonspacesCharacterClass()
+ {
+ if (!nonspacesCached)
+ m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
+ return nonspacesCached;
+ }
+ CharacterClass* nonwordcharCharacterClass()
+ {
+ if (!nonwordcharCached)
+ m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
+ return nonwordcharCached;
+ }
+
+ bool m_ignoreCase : 1;
+ bool m_multiline : 1;
+ bool m_containsBackreferences : 1;
+ bool m_containsBOL : 1;
+ unsigned m_numSubpatterns;
+ unsigned m_maxBackReference;
+ PatternDisjunction* m_body;
+ Vector<PatternDisjunction*, 4> m_disjunctions;
+ Vector<CharacterClass*> m_userCharacterClasses;
+
+private:
+ const char* compile(const String& patternString);
+
+ CharacterClass* newlineCached;
+ CharacterClass* digitsCached;
+ CharacterClass* spacesCached;
+ CharacterClass* wordcharCached;
+ CharacterClass* nondigitsCached;
+ CharacterClass* nonspacesCached;
+ CharacterClass* nonwordcharCached;
+};
+
+} } // namespace JSC::Yarr
+
+#endif // YarrPattern_h
diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp
new file mode 100644
index 0000000000..aa98c4a354
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "YarrSyntaxChecker.h"
+
+#include "YarrParser.h"
+
+namespace JSC { namespace Yarr {
+
+class SyntaxChecker {
+public:
+ void assertionBOL() {}
+ void assertionEOL() {}
+ void assertionWordBoundary(bool) {}
+ void atomPatternCharacter(UChar) {}
+ void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {}
+ void atomCharacterClassBegin(bool = false) {}
+ void atomCharacterClassAtom(UChar) {}
+ void atomCharacterClassRange(UChar, UChar) {}
+ void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {}
+ void atomCharacterClassEnd() {}
+ void atomParenthesesSubpatternBegin(bool = true) {}
+ void atomParentheticalAssertionBegin(bool = false) {}
+ void atomParenthesesEnd() {}
+ void atomBackReference(unsigned) {}
+ void quantifyAtom(unsigned, unsigned, bool) {}
+ void disjunction() {}
+};
+
+const char* checkSyntax(const String& pattern)
+{
+ SyntaxChecker syntaxChecker;
+ return parse(syntaxChecker, pattern);
+}
+
+}} // JSC::YARR
diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.h b/src/3rdparty/masm/yarr/YarrSyntaxChecker.h
new file mode 100644
index 0000000000..104ced3ab4
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrSyntaxChecker_h
+#define YarrSyntaxChecker_h
+
+#include <wtf/text/WTFString.h>
+
+namespace JSC { namespace Yarr {
+
+const char* checkSyntax(const String& pattern);
+
+}} // JSC::YARR
+
+#endif // YarrSyntaxChecker_h
+
diff --git a/src/3rdparty/masm/yarr/yarr.pri b/src/3rdparty/masm/yarr/yarr.pri
new file mode 100644
index 0000000000..7e9b4d3f3b
--- /dev/null
+++ b/src/3rdparty/masm/yarr/yarr.pri
@@ -0,0 +1,12 @@
+# -------------------------------------------------------------------
+# Project file for YARR
+#
+# See 'Tools/qmake/README' for an overview of the build system
+# -------------------------------------------------------------------
+
+SOURCES += \
+ $$PWD/YarrInterpreter.cpp \
+ $$PWD/YarrPattern.cpp \
+ $$PWD/YarrSyntaxChecker.cpp \
+ $$PWD/YarrCanonicalizeUCS2.cpp
+