diff options
author | Jędrzej Nowacki <jedrzej.nowacki@digia.com> | 2013-01-29 14:20:50 +0100 |
---|---|---|
committer | Lars Knoll <lars.knoll@digia.com> | 2013-01-30 13:20:40 +0100 |
commit | 74fba4d8069c946c1ba12b9ac4d4026aaf14118b (patch) | |
tree | 138cc204ec33ef0c838c98355ea84846e11ca1a7 /src/3rdparty/masm/yarr | |
parent | 0781ecb087b027cccc1c44de1a1c7520cc89e2d2 (diff) |
Say hello to QtV4 module.
Change-Id: I507cd5707b7d7223a0d901cf939896fb2649b684
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Diffstat (limited to 'src/3rdparty/masm/yarr')
-rw-r--r-- | src/3rdparty/masm/yarr/Yarr.h | 69 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp | 463 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h | 138 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js | 219 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrInterpreter.cpp | 1964 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrInterpreter.h | 385 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrJIT.cpp | 2667 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrJIT.h | 141 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrParser.h | 880 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrPattern.cpp | 874 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrPattern.h | 421 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp | 59 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrSyntaxChecker.h | 38 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/yarr.pri | 12 |
14 files changed, 8330 insertions, 0 deletions
diff --git a/src/3rdparty/masm/yarr/Yarr.h b/src/3rdparty/masm/yarr/Yarr.h new file mode 100644 index 0000000000..d393e9fa90 --- /dev/null +++ b/src/3rdparty/masm/yarr/Yarr.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef Yarr_h +#define Yarr_h + +#include "YarrInterpreter.h" +#include "YarrPattern.h" + +namespace JSC { namespace Yarr { + +#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoBackReference 2 +#define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative. +#define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1 +#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1 +#define YarrStackSpaceForBackTrackInfoParentheses 2 + +static const unsigned quantifyInfinite = UINT_MAX; +static const unsigned offsetNoMatch = (unsigned)-1; + +// The below limit restricts the number of "recursive" match calls in order to +// avoid spending exponential time on complex regular expressions. +static const unsigned matchLimit = 1000000; + +enum JSRegExpResult { + JSRegExpMatch = 1, + JSRegExpNoMatch = 0, + JSRegExpErrorNoMatch = -1, + JSRegExpErrorHitLimit = -2, + JSRegExpErrorNoMemory = -3, + JSRegExpErrorInternal = -4 +}; + +enum YarrCharSize { + Char8, + Char16 +}; + +} } // namespace JSC::Yarr + +#endif // Yarr_h + diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp new file mode 100644 index 0000000000..7bb3d08eb5 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp @@ -0,0 +1,463 @@ +/* + * Copyright (C) 2012 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js + +#include "config.h" +#include "YarrCanonicalizeUCS2.h" + +namespace JSC { namespace Yarr { + +#include <stdint.h> + +uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 }; +uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 }; +uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 }; +uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 }; +uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 }; +uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 }; +uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 }; +uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 }; +uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 }; +uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 }; +uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 }; +uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 }; +uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 }; +uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 }; +uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 }; + +static const size_t UCS2_CANONICALIZATION_SETS = 15; +uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { + ucs2CharacterSet0, + ucs2CharacterSet1, + ucs2CharacterSet2, + ucs2CharacterSet3, + ucs2CharacterSet4, + ucs2CharacterSet5, + ucs2CharacterSet6, + ucs2CharacterSet7, + ucs2CharacterSet8, + ucs2CharacterSet9, + ucs2CharacterSet10, + ucs2CharacterSet11, + ucs2CharacterSet12, + ucs2CharacterSet13, + ucs2CharacterSet14, +}; + +const size_t UCS2_CANONICALIZATION_RANGES = 364; +UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = { + { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique }, + { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo }, + { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique }, + { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi }, + { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique }, + { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet }, + { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique }, + { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo }, + { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique }, + { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo }, + { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique }, + { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi }, + { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique }, + { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi }, + { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo }, + { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0130u, 0x0131u, 0x0000u, CanonicalizeUnique }, + { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique }, + { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique }, + { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi }, + { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique }, + { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo }, + { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo }, + { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo }, + { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo }, + { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique }, + { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo }, + { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo }, + { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo }, + { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo }, + { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo }, + { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo }, + { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo }, + { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo }, + { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo }, + { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique }, + { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo }, + { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo }, + { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo }, + { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo }, + { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo }, + { 0x01a7u, 0x01a8u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo }, + { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique }, + { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo }, + { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo }, + { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo }, + { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique }, + { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique }, + { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo }, + { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique }, + { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet }, + { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet }, + { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet }, + { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi }, + { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique }, + { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet }, + { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi }, + { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi }, + { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi }, + { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique }, + { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique }, + { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo }, + { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi }, + { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo }, + { 0x023fu, 0x0240u, 0x2a3fu, CanonicalizeRangeLo }, + { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi }, + { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo }, + { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo }, + { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo }, + { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo }, + { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo }, + { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi }, + { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi }, + { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique }, + { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi }, + { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique }, + { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi }, + { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique }, + { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi }, + { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique }, + { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi }, + { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique }, + { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi }, + { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique }, + { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo }, + { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique }, + { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi }, + { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi }, + { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique }, + { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo }, + { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique }, + { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi }, + { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique }, + { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo }, + { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi }, + { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique }, + { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi }, + { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique }, + { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo }, + { 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique }, + { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi }, + { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique }, + { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi }, + { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique }, + { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi }, + { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi }, + { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi }, + { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi }, + { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique }, + { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi }, + { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique }, + { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet }, + { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique }, + { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique }, + { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique }, + { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo }, + { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique }, + { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo }, + { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique }, + { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo }, + { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique }, + { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo }, + { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique }, + { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo }, + { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique }, + { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo }, + { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet }, + { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo }, + { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet }, + { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo }, + { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet }, + { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet }, + { 0x039au, 0x039au, 0x0008u, CanonicalizeSet }, + { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo }, + { 0x039cu, 0x039cu, 0x0009u, CanonicalizeSet }, + { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo }, + { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet }, + { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet }, + { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique }, + { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet }, + { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo }, + { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet }, + { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo }, + { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi }, + { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi }, + { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique }, + { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi }, + { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet }, + { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi }, + { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet }, + { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi }, + { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet }, + { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet }, + { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet }, + { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi }, + { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet }, + { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi }, + { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet }, + { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet }, + { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet }, + { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi }, + { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet }, + { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi }, + { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi }, + { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi }, + { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo }, + { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet }, + { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet }, + { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique }, + { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet }, + { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet }, + { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi }, + { 0x03d8u, 0x03efu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet }, + { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet }, + { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo }, + { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique }, + { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet }, + { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique }, + { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi }, + { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique }, + { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi }, + { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo }, + { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo }, + { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi }, + { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi }, + { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique }, + { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo }, + { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi }, + { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique }, + { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo }, + { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique }, + { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi }, + { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique }, + { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo }, + { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique }, + { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo }, + { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique }, + { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo }, + { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique }, + { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet }, + { 0x1e62u, 0x1e95u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique }, + { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet }, + { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique }, + { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique }, + { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi }, + { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique }, + { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique }, + { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi }, + { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique }, + { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique }, + { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique }, + { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique }, + { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique }, + { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi }, + { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique }, + { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique }, + { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi }, + { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique }, + { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo }, + { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi }, + { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo }, + { 0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo }, + { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo }, + { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo }, + { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo }, + { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo }, + { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique }, + { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo }, + { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique }, + { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi }, + { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi }, + { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique }, + { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet }, + { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique }, + { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi }, + { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique }, + { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo }, + { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique }, + { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi }, + { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi }, + { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique }, + { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo }, + { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique }, + { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo }, + { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique }, + { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi }, + { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi }, + { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi }, + { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique }, + { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi }, + { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi }, + { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique }, + { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo }, + { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique }, + { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi }, + { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique }, + { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo }, + { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi }, + { 0x2180u, 0x2182u, 0x0000u, CanonicalizeUnique }, + { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique }, + { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo }, + { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi }, + { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique }, + { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo }, + { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique }, + { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi }, + { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique }, + { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi }, + { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi }, + { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi }, + { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi }, + { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi }, + { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi }, + { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi }, + { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi }, + { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi }, + { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique }, + { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique }, + { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique }, + { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi }, + { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique }, + { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique }, + { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi }, + { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique }, + { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique }, + { 0xa680u, 0xa697u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique }, + { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique }, + { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique }, + { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi }, + { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique }, + { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned }, + { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi }, + { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique }, + { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique }, + { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned }, + { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique }, + { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo }, + { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique }, + { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi }, + { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique }, +}; + +const size_t LATIN_CANONICALIZATION_RANGES = 20; +LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = { + { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf }, + { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf }, + { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf }, + { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf }, + { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf }, + { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf }, + { 0x00f8u, 0x00feu, 0x0000u, CanonicalizeLatinMask0x20 }, + { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf }, + { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid }, + { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther }, + { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid }, + { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther }, + { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid }, + { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther }, + { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid }, +}; + +} } // JSC::Yarr + diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h new file mode 100644 index 0000000000..be0ead43d2 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2012 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrCanonicalizeUCS2_H +#define YarrCanonicalizeUCS2_H + +#include <stdint.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp) +// provides information for each UCS2 code point as to the set of code points that it should +// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8. +enum UCS2CanonicalizationType { + CanonicalizeUnique, // No canonically equal values, e.g. 0x0. + CanonicalizeSet, // Value indicates a set in characterSetInfo. + CanonicalizeRangeLo, // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61. + CanonicalizeRangeHi, // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41. + CanonicalizeAlternatingAligned, // Aligned consequtive pair, e.g. 0x1f4,0x1f5. + CanonicalizeAlternatingUnaligned, // Unaligned consequtive pair, e.g. 0x241,0x242. +}; +struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; }; +extern const size_t UCS2_CANONICALIZATION_RANGES; +extern uint16_t* characterSetInfo[]; +extern UCS2CanonicalizationRange rangeInfo[]; + +// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to +// the set of Latin1 codepoints that could match. +enum LatinCanonicalizationType { + CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range. + CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20. + CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is. + CanonicalizeLatinInvalid, // Cannot match against Latin1 input. +}; +struct LatinCanonicalizationRange { uint16_t begin, end, value, type; }; +extern const size_t LATIN_CANONICALIZATION_RANGES; +extern LatinCanonicalizationRange latinRangeInfo[]; + +// This searches in log2 time over ~364 entries, so should typically result in 8 compares. +inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch) +{ + UCS2CanonicalizationRange* info = rangeInfo; + size_t entries = UCS2_CANONICALIZATION_RANGES; + + while (true) { + size_t candidate = entries >> 1; + UCS2CanonicalizationRange* candidateInfo = info + candidate; + if (ch < candidateInfo->begin) + entries = candidate; + else if (ch <= candidateInfo->end) + return candidateInfo; + else { + info = candidateInfo + 1; + entries -= (candidate + 1); + } + } +} + +// Should only be called for characters that have one canonically matching value. +inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch) +{ + ASSERT(ch >= info->begin && ch <= info->end); + switch (info->type) { + case CanonicalizeRangeLo: + return ch + info->value; + case CanonicalizeRangeHi: + return ch - info->value; + case CanonicalizeAlternatingAligned: + return ch ^ 1; + case CanonicalizeAlternatingUnaligned: + return ((ch - 1) ^ 1) + 1; + default: + ASSERT_NOT_REACHED(); + } + ASSERT_NOT_REACHED(); + return 0; +} + +// Returns true if no other UCS2 codepoint can match this value. +inline bool isCanonicallyUnique(UChar ch) +{ + return rangeInfoFor(ch)->type == CanonicalizeUnique; +} + +// Returns true if values are equal, under the canonicalization rules. +inline bool areCanonicallyEquivalent(UChar a, UChar b) +{ + UCS2CanonicalizationRange* info = rangeInfoFor(a); + switch (info->type) { + case CanonicalizeUnique: + return a == b; + case CanonicalizeSet: { + for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) { + if (a == b) + return true; + } + return false; + } + case CanonicalizeRangeLo: + return (a == b) || (a + info->value == b); + case CanonicalizeRangeHi: + return (a == b) || (a - info->value == b); + case CanonicalizeAlternatingAligned: + return (a | 1) == (b | 1); + case CanonicalizeAlternatingUnaligned: + return ((a - 1) | 1) == ((b - 1) | 1); + } + + ASSERT_NOT_REACHED(); + return false; +} + +} } // JSC::Yarr + +#endif diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js new file mode 100644 index 0000000000..00361dd46e --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2012 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// See ES 5.1, 15.10.2.8 +function canonicalize(ch) +{ + var u = String.fromCharCode(ch).toUpperCase(); + if (u.length > 1) + return ch; + var cu = u.charCodeAt(0); + if (ch >= 128 && cu < 128) + return ch; + return cu; +} + +var MAX_UCS2 = 0xFFFF; +var MAX_LATIN = 0xFF; + +var groupedCanonically = []; +// Pass 1: populate groupedCanonically - this is mapping from canonicalized +// values back to the set of character code that canonicalize to them. +for (var i = 0; i <= MAX_UCS2; ++i) { + var ch = canonicalize(i); + if (!groupedCanonically[ch]) + groupedCanonically[ch] = []; + groupedCanonically[ch].push(i); +} + +var typeInfo = []; +var latinTypeInfo = []; +var characterSetInfo = []; +// Pass 2: populate typeInfo & characterSetInfo. For every character calculate +// a typeInfo value, described by the types above, and a value payload. +for (cu in groupedCanonically) { + // The set of characters that canonicalize to cu + var characters = groupedCanonically[cu]; + + // If there is only one, it is unique. + if (characters.length == 1) { + typeInfo[characters[0]] = "CanonicalizeUnique:0"; + latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0"; + continue; + } + + // Sort the array. + characters.sort(function(x,y){return x-y;}); + + // If there are more than two characters, create an entry in characterSetInfo. + if (characters.length > 2) { + for (i in characters) + typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; + characterSetInfo.push(characters); + + if (characters[1] <= MAX_LATIN) + throw new Error("sets with more than one latin character not supported!"); + if (characters[0] <= MAX_LATIN) { + for (i in characters) + latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0]; + latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0"; + } else { + for (i in characters) + latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0"; + } + + continue; + } + + // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner. + var lo = characters[0]; + var hi = characters[1]; + var delta = hi - lo; + if (delta == 1) { + var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; + typeInfo[lo] = type; + typeInfo[hi] = type; + } else { + typeInfo[lo] = "CanonicalizeRangeLo:" + delta; + typeInfo[hi] = "CanonicalizeRangeHi:" + delta; + } + + if (lo > MAX_LATIN) { + latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0"; + latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0"; + } else if (hi > MAX_LATIN) { + latinTypeInfo[lo] = "CanonicalizeLatinSelf:0"; + latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo; + } else { + if (delta != 0x20 || lo & 0x20) + throw new Error("pairs of latin characters that don't mask with 0x20 not supported!"); + latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0"; + latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0"; + } +} + +var rangeInfo = []; +// Pass 3: coallesce types into ranges. +for (var end = 0; end <= MAX_UCS2; ++end) { + var begin = end; + var type = typeInfo[end]; + while (end < MAX_UCS2 && typeInfo[end + 1] == type) + ++end; + rangeInfo.push({begin:begin, end:end, type:type}); +} + +var latinRangeInfo = []; +// Pass 4: coallesce latin-1 types into ranges. +for (var end = 0; end <= MAX_UCS2; ++end) { + var begin = end; + var type = latinTypeInfo[end]; + while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type) + ++end; + latinRangeInfo.push({begin:begin, end:end, type:type}); +} + + +// Helper function to convert a number to a fixed width hex representation of a C uint16_t. +function hex(x) +{ + var s = Number(x).toString(16); + while (s.length < 4) + s = 0 + s; + return "0x" + s + "u"; +} + +var copyright = ( + "/*" + "\n" + + " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" + + " *" + "\n" + + " * Redistribution and use in source and binary forms, with or without" + "\n" + + " * modification, are permitted provided that the following conditions" + "\n" + + " * are met:" + "\n" + + " * 1. Redistributions of source code must retain the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer." + "\n" + + " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer in the" + "\n" + + " * documentation and/or other materials provided with the distribution." + "\n" + + " *" + "\n" + + " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" + + " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" + + " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" + + " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" + + " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" + + " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" + + " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" + + " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" + + " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" + + " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" + + " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" + + " */"); + +print(copyright); +print(); +print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js"); +print(); +print('#include "config.h"'); +print('#include "YarrCanonicalizeUCS2.h"'); +print(); +print("namespace JSC { namespace Yarr {"); +print(); +print("#include <stdint.h>"); +print(); + +for (i in characterSetInfo) { + var characters = "" + var set = characterSetInfo[i]; + for (var j in set) + characters += hex(set[j]) + ", "; + print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };"); +} +print(); +print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";"); +print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {"); +for (i in characterSetInfo) +print(" ucs2CharacterSet" + i + ","); +print("};"); +print(); +print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";"); +print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {"); +for (i in rangeInfo) { + var info = rangeInfo[i]; + var typeAndValue = info.type.split(':'); + print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); +} +print("};"); +print(); +print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";"); +print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {"); +for (i in latinRangeInfo) { + var info = latinRangeInfo[i]; + var typeAndValue = info.type.split(':'); + print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); +} +print("};"); +print(); +print("} } // JSC::Yarr"); +print(); + diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.cpp b/src/3rdparty/masm/yarr/YarrInterpreter.cpp new file mode 100644 index 0000000000..31603f6d34 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrInterpreter.cpp @@ -0,0 +1,1964 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrInterpreter.h" + +#include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" +#include <wtf/BumpPointerAllocator.h> +#include <wtf/DataLog.h> +#include <wtf/text/CString.h> +#include <wtf/text/WTFString.h> + +#ifndef NDEBUG +#include <stdio.h> +#endif + +using namespace WTF; + +namespace JSC { namespace Yarr { + +template<typename CharType> +class Interpreter { +public: + struct ParenthesesDisjunctionContext; + + struct BackTrackInfoPatternCharacter { + uintptr_t matchAmount; + }; + struct BackTrackInfoCharacterClass { + uintptr_t matchAmount; + }; + struct BackTrackInfoBackReference { + uintptr_t begin; // Not really needed for greedy quantifiers. + uintptr_t matchAmount; // Not really needed for fixed quantifiers. + }; + struct BackTrackInfoAlternative { + uintptr_t offset; + }; + struct BackTrackInfoParentheticalAssertion { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesOnce { + uintptr_t begin; + }; + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; + }; + struct BackTrackInfoParentheses { + uintptr_t matchAmount; + ParenthesesDisjunctionContext* lastContext; + }; + + static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) + { + context->next = backTrack->lastContext; + backTrack->lastContext = context; + ++backTrack->matchAmount; + } + + static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) + { + ASSERT(backTrack->matchAmount); + ASSERT(backTrack->lastContext); + backTrack->lastContext = backTrack->lastContext->next; + --backTrack->matchAmount; + } + + struct DisjunctionContext + { + DisjunctionContext() + : term(0) + { + } + + void* operator new(size_t, void* where) + { + return where; + } + + int term; + unsigned matchBegin; + unsigned matchEnd; + uintptr_t frame[1]; + }; + + DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) + { + size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new (allocatorPool->alloc(size)) DisjunctionContext(); + } + + void freeDisjunctionContext(DisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + struct ParenthesesDisjunctionContext + { + ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term) + : next(0) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; + + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { + subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; + output[(firstSubpatternId << 1) + i] = offsetNoMatch; + } + + new (getDisjunctionContext(term)) DisjunctionContext(); + } + + void* operator new(size_t, void* where) + { + return where; + } + + void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) + { + for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) + output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; + } + + DisjunctionContext* getDisjunctionContext(ByteTerm& term) + { + return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); + } + + ParenthesesDisjunctionContext* next; + unsigned subpatternBackup[1]; + }; + + ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) + { + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + allocatorPool = allocatorPool->ensureCapacity(size); + if (!allocatorPool) + CRASH(); + return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); + } + + void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) + { + allocatorPool = allocatorPool->dealloc(context); + } + + class InputStream { + public: + InputStream(const CharType* input, unsigned start, unsigned length) + : input(input) + , pos(start) + , length(length) + { + } + + void next() + { + ++pos; + } + + void rewind(unsigned amount) + { + ASSERT(pos >= amount); + pos -= amount; + } + + int read() + { + ASSERT(pos < length); + if (pos < length) + return input[pos]; + return -1; + } + + int readPair() + { + ASSERT(pos + 1 < length); + return input[pos] | input[pos + 1] << 16; + } + + int readChecked(unsigned negativePositionOffest) + { + if (pos < negativePositionOffest) + CRASH(); + unsigned p = pos - negativePositionOffest; + ASSERT(p < length); + return input[p]; + } + + int reread(unsigned from) + { + ASSERT(from < length); + return input[from]; + } + + int prev() + { + ASSERT(!(pos > length)); + if (pos && length) + return input[pos - 1]; + return -1; + } + + unsigned getPos() + { + return pos; + } + + void setPos(unsigned p) + { + pos = p; + } + + bool atStart() + { + return pos == 0; + } + + bool atEnd() + { + return pos == length; + } + + unsigned end() + { + return length; + } + + bool checkInput(unsigned count) + { + if (((pos + count) <= length) && ((pos + count) >= pos)) { + pos += count; + return true; + } + return false; + } + + void uncheckInput(unsigned count) + { + if (pos < count) + CRASH(); + pos -= count; + } + + bool atStart(unsigned negativePositionOffest) + { + return pos == negativePositionOffest; + } + + bool atEnd(unsigned negativePositionOffest) + { + if (pos < negativePositionOffest) + CRASH(); + return (pos - negativePositionOffest) == length; + } + + bool isAvailableInput(unsigned offset) + { + return (((pos + offset) <= length) && ((pos + offset) >= pos)); + } + + private: + const CharType* input; + unsigned pos; + unsigned length; + }; + + bool testCharacterClass(CharacterClass* characterClass, int ch) + { + if (ch & 0xFF80) { + for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) + if (ch == characterClass->m_matchesUnicode[i]) + return true; + for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) + if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) + return true; + } else { + for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) + if (ch == characterClass->m_matches[i]) + return true; + for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) + if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) + return true; + } + + return false; + } + + bool checkCharacter(int testChar, unsigned negativeInputOffset) + { + return testChar == input.readChecked(negativeInputOffset); + } + + bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) + { + int ch = input.readChecked(negativeInputOffset); + return (loChar == ch) || (hiChar == ch); + } + + bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset) + { + bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset)); + return invert ? !match : match; + } + + bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset) + { + unsigned matchSize = (unsigned)(matchEnd - matchBegin); + + if (!input.checkInput(matchSize)) + return false; + + if (pattern->m_ignoreCase) { + for (unsigned i = 0; i < matchSize; ++i) { + int oldCh = input.reread(matchBegin + i); + int ch = input.readChecked(negativeInputOffset + matchSize - i); + + if (oldCh == ch) + continue; + + // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that + // unicode values are never allowed to match against ascii ones. + if (isASCII(oldCh) || isASCII(ch)) { + if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) + continue; + } else if (areCanonicallyEquivalent(oldCh, ch)) + continue; + + input.uncheckInput(matchSize); + return false; + } + } else { + for (unsigned i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { + input.uncheckInput(matchSize); + return false; + } + } + } + + return true; + } + + bool matchAssertionBOL(ByteTerm& term) + { + return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); + } + + bool matchAssertionEOL(ByteTerm& term) + { + if (term.inputPosition) + return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + + return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + } + + bool matchAssertionWordBoundary(ByteTerm& term) + { + bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1)); + bool readIsWordchar; + if (term.inputPosition) + readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); + else + readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); + + bool wordBoundary = prevIsWordchar != readIsWordchar; + return term.invert() ? !wordBoundary : wordBoundary; + } + + bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) + { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) + return false; + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + return true; + } + + case QuantifierNonGreedy: + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeCharacterClass); + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.uncheckInput(1); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + ++backTrack->matchAmount; + if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) + return true; + } + input.uncheckInput(backTrack->matchAmount); + break; + } + + return false; + } + + bool matchBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + unsigned matchBegin = output[(term.atom.subpatternId << 1)]; + unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. + // In this case the result of match is empty string like when it references to a parentheses with zero-width match. + // Eg.: /(a\1)/ + if (matchEnd == offsetNoMatch) + return true; + + if (matchBegin == offsetNoMatch) + return true; + + ASSERT(matchBegin <= matchEnd); + + if (matchBegin == matchEnd) + return true; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + backTrack->begin = input.getPos(); + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + input.setPos(backTrack->begin); + return false; + } + } + return true; + } + + case QuantifierGreedy: { + unsigned matchAmount = 0; + while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + ++matchAmount; + backTrack->matchAmount = matchAmount; + return true; + } + + case QuantifierNonGreedy: + backTrack->begin = input.getPos(); + backTrack->matchAmount = 0; + return true; + } + + ASSERT_NOT_REACHED(); + return false; + } + + bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeBackReference); + BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); + + unsigned matchBegin = output[(term.atom.subpatternId << 1)]; + unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + if (matchBegin == offsetNoMatch) + return false; + + ASSERT(matchBegin <= matchEnd); + + if (matchBegin == matchEnd) + return false; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: + // for quantityCount == 1, could rewind. + input.setPos(backTrack->begin); + break; + + case QuantifierGreedy: + if (backTrack->matchAmount) { + --backTrack->matchAmount; + input.rewind(matchEnd - matchBegin); + return true; + } + break; + + case QuantifierNonGreedy: + if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + ++backTrack->matchAmount; + return true; + } + input.setPos(backTrack->begin); + break; + } + + return false; + } + + void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + } + } + void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) + { + unsigned firstSubpatternId = term.atom.subpatternId; + unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; + context->restoreOutput(output, firstSubpatternId, count); + } + JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) + { + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + + JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) + return JSRegExpMatch; + + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + return JSRegExpNoMatch; + } + + bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: { + // set this speculatively; if we get to the parens end this will be true. + backTrack->begin = input.getPos(); + break; + } + case QuantifierNonGreedy: { + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = input.getPos() - term.inputPosition; + } + + return true; + } + + bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + } + + if (term.atom.quantityType == QuantifierFixedCount) + return true; + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + return backTrack->begin != input.getPos(); + } + + bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + if (term.capture()) { + unsigned subpatternId = term.atom.subpatternId; + output[(subpatternId << 1)] = offsetNoMatch; + output[(subpatternId << 1) + 1] = offsetNoMatch; + } + + switch (term.atom.quantityType) { + case QuantifierGreedy: + // if we backtrack to this point, there is another chance - try matching nothing. + ASSERT(backTrack->begin != notFound); + backTrack->begin = notFound; + context->term += term.atom.parenthesesWidth; + return true; + case QuantifierNonGreedy: + ASSERT(backTrack->begin != notFound); + case QuantifierFixedCount: + break; + } + + return false; + } + + bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); + + switch (term.atom.quantityType) { + case QuantifierGreedy: + if (backTrack->begin == notFound) { + context->term -= term.atom.parenthesesWidth; + return false; + } + case QuantifierNonGreedy: + if (backTrack->begin == notFound) { + backTrack->begin = input.getPos(); + if (term.capture()) { + // Technically this access to inputPosition should be accessing the begin term's + // inputPosition, but for repeats other than fixed these values should be + // the same anyway! (We don't pre-check for greedy or non-greedy matches.) + ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); + unsigned subpatternId = term.atom.subpatternId; + output[subpatternId << 1] = input.getPos() + term.inputPosition; + } + context->term -= term.atom.parenthesesWidth; + return true; + } + case QuantifierFixedCount: + break; + } + + return false; + } + + bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + backTrack->begin = input.getPos(); + return true; + } + + bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd); + + BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); + // Empty match is a failed match. + if (backTrack->begin == input.getPos()) + return false; + + // Successful match! Okay, what's next? - loop around and try to match moar! + context->term -= (term.atom.parenthesesWidth + 1); + return true; + } + + bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + ASSERT(term.atom.quantityType == QuantifierGreedy); + ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(!term.capture()); + + // If we backtrack to this point, we have failed to match this iteration of the parens. + // Since this is greedy / zero minimum a failed is also accepted as a match! + context->term += term.atom.parenthesesWidth; + return true; + } + + bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) + { + // 'Terminal' parentheses are at the end of the regex, and as such a match past end + // should always be returned as a successful match - we should never backtrack to here. + ASSERT_NOT_REACHED(); + return false; + } + + bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + backTrack->begin = input.getPos(); + return true; + } + + bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + // We've reached the end of the parens; if they are inverted, this is failure. + if (term.invert()) { + context->term -= term.atom.parenthesesWidth; + return false; + } + + return true; + } + + bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); + ASSERT(term.atom.quantityCount == 1); + + // We've failed to match parens; if they are inverted, this is win! + if (term.invert()) { + context->term += term.atom.parenthesesWidth; + return true; + } + + return false; + } + + bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); + ASSERT(term.atom.quantityCount == 1); + + BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); + + input.setPos(backTrack->begin); + + context->term -= term.atom.parenthesesWidth; + return false; + } + + JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + backTrack->matchAmount = 0; + backTrack->lastContext = 0; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + + break; + } + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: + return JSRegExpMatch; + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + // Rules for backtracking differ depending on whether this is greedy or non-greedy. + // + // Greedy matches never should try just adding more - you should already have done + // the 'more' cases. Always backtrack, at least a leetle bit. However cases where + // you backtrack an item off the list needs checking, since we'll never have matched + // the one less case. Tracking forwards, still add as much as possible. + // + // Non-greedy, we've already done the one less case, so don't match on popping. + // We haven't done the one more case, so always try to add that. + // + JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) + { + ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); + + BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); + ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + + ParenthesesDisjunctionContext* context = 0; + JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); + + if (result != JSRegExpMatch) + return result; + + // While we haven't yet reached our fixed limit, + while (backTrack->matchAmount < term.atom.quantityCount) { + // Try to do a match, and it it succeeds, add it to the list. + context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + + if (result == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + // The match failed; try to find an alternate point to carry on from. + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; + } + } + + ASSERT(backTrack->matchAmount == term.atom.quantityCount); + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + case QuantifierGreedy: { + if (!backTrack->matchAmount) + return JSRegExpNoMatch; + + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + while (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (parenthesesResult == JSRegExpMatch) + appendParenthesesDisjunctionContext(backTrack, context); + else { + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (parenthesesResult != JSRegExpNoMatch) + return parenthesesResult; + + break; + } + } + } else { + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + if (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + case QuantifierNonGreedy: { + // If we've not reached the limit, try to add one more match. + if (backTrack->matchAmount < term.atom.quantityCount) { + ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (result == JSRegExpMatch) { + appendParenthesesDisjunctionContext(backTrack, context); + recordParenthesesMatch(term, context); + return JSRegExpMatch; + } + + resetMatches(term, context); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + // Nope - okay backtrack looking for an alternative. + while (backTrack->matchAmount) { + ParenthesesDisjunctionContext* context = backTrack->lastContext; + JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); + if (result == JSRegExpMatch) { + // successful backtrack! we're back in the game! + if (backTrack->matchAmount) { + context = backTrack->lastContext; + recordParenthesesMatch(term, context); + } + return JSRegExpMatch; + } + + // pop a match off the stack + resetMatches(term, context); + popParenthesesDisjunctionContext(backTrack); + freeParenthesesDisjunctionContext(context); + + if (result != JSRegExpNoMatch) + return result; + } + + return JSRegExpNoMatch; + } + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) + { + UNUSED_PARAM(term); + unsigned matchBegin = context->matchBegin; + + if (matchBegin) { + for (matchBegin--; true; matchBegin--) { + if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { + ++matchBegin; + break; + } + + if (!matchBegin) + break; + } + } + + unsigned matchEnd = input.getPos(); + + for (; (matchEnd != input.end()) + && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { } + + if (((matchBegin && term.anchors.m_bol) + || ((matchEnd != input.end()) && term.anchors.m_eol)) + && !pattern->m_multiline) + return false; + + context->matchBegin = matchBegin; + context->matchEnd = matchEnd; + return true; + } + +#define MATCH_NEXT() { ++context->term; goto matchAgain; } +#define BACKTRACK() { --context->term; goto backtrack; } +#define currentTerm() (disjunction->terms[context->term]) + JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + if (!--remainingMatchCount) + return JSRegExpErrorHitLimit; + + if (btrack) + BACKTRACK(); + + context->matchBegin = input.getPos(); + context->term = 0; + + matchAgain: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + MATCH_NEXT(); + case ByteTerm::TypeSubpatternEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeBodyAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeBodyAlternativeDisjunction: + case ByteTerm::TypeBodyAlternativeEnd: + context->matchEnd = input.getPos(); + return JSRegExpMatch; + + case ByteTerm::TypeAlternativeBegin: + MATCH_NEXT(); + case ByteTerm::TypeAlternativeDisjunction: + case ByteTerm::TypeAlternativeEnd: { + int offset = currentTerm().alternative.end; + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + backTrack->offset = offset; + context->term += offset; + MATCH_NEXT(); + } + + case ByteTerm::TypeAssertionBOL: + if (matchAssertionBOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionEOL: + if (matchAssertionEOL(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeAssertionWordBoundary: + if (matchAssertionWordBoundary(currentTerm())) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) + BACKTRACK(); + } + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + unsigned matchAmount = 0; + while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { + input.uncheckInput(1); + break; + } + ++matchAmount; + } + backTrack->matchAmount = matchAmount; + + MATCH_NEXT(); + } + case ByteTerm::TypePatternCasedCharacterNonGreedy: { + BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->matchAmount = 0; + MATCH_NEXT(); + } + + case ByteTerm::TypeCharacterClass: + if (matchCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (matchBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = matchParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (matchParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (matchParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (matchParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (matchParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (matchParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (matchParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + if (input.checkInput(currentTerm().checkInputCount)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeUncheckInput: + input.uncheckInput(currentTerm().checkInputCount); + MATCH_NEXT(); + + case ByteTerm::TypeDotStarEnclosure: + if (matchDotStarEnclosure(currentTerm(), context)) + return JSRegExpMatch; + BACKTRACK(); + } + + // We should never fall-through to here. + ASSERT_NOT_REACHED(); + + backtrack: + ASSERT(context->term < static_cast<int>(disjunction->terms.size())); + + switch (currentTerm().type) { + case ByteTerm::TypeSubpatternBegin: + return JSRegExpNoMatch; + case ByteTerm::TypeSubpatternEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeBodyAlternativeBegin: + case ByteTerm::TypeBodyAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + + if (input.atEnd()) + return JSRegExpNoMatch; + + input.next(); + + context->matchBegin = input.getPos(); + + if (currentTerm().alternative.onceThrough) + context->term += currentTerm().alternative.next; + + MATCH_NEXT(); + } + case ByteTerm::TypeBodyAlternativeEnd: + ASSERT_NOT_REACHED(); + + case ByteTerm::TypeAlternativeBegin: + case ByteTerm::TypeAlternativeDisjunction: { + int offset = currentTerm().alternative.next; + context->term += offset; + if (offset > 0) + MATCH_NEXT(); + BACKTRACK(); + } + case ByteTerm::TypeAlternativeEnd: { + // We should never backtrack back into an alternative of the main body of the regex. + BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); + unsigned offset = backTrack->offset; + context->term -= offset; + BACKTRACK(); + } + + case ByteTerm::TypeAssertionBOL: + case ByteTerm::TypeAssertionEOL: + case ByteTerm::TypeAssertionWordBoundary: + BACKTRACK(); + + case ByteTerm::TypePatternCharacterOnce: + case ByteTerm::TypePatternCharacterFixed: + case ByteTerm::TypePatternCharacterGreedy: + case ByteTerm::TypePatternCharacterNonGreedy: + if (backtrackPatternCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypePatternCasedCharacterOnce: + case ByteTerm::TypePatternCasedCharacterFixed: + case ByteTerm::TypePatternCasedCharacterGreedy: + case ByteTerm::TypePatternCasedCharacterNonGreedy: + if (backtrackPatternCasedCharacter(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeCharacterClass: + if (backtrackCharacterClass(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeBackReference: + if (backtrackBackReference(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpattern: { + JSRegExpResult result = backtrackParentheses(currentTerm(), context); + + if (result == JSRegExpMatch) { + MATCH_NEXT(); + } else if (result != JSRegExpNoMatch) + return result; + + BACKTRACK(); + } + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + if (backtrackParenthesesOnceBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + if (backtrackParenthesesOnceEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + if (backtrackParenthesesTerminalBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + if (backtrackParenthesesTerminalEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionBegin: + if (backtrackParentheticalAssertionBegin(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + case ByteTerm::TypeParentheticalAssertionEnd: + if (backtrackParentheticalAssertionEnd(currentTerm(), context)) + MATCH_NEXT(); + BACKTRACK(); + + case ByteTerm::TypeCheckInput: + input.uncheckInput(currentTerm().checkInputCount); + BACKTRACK(); + + case ByteTerm::TypeUncheckInput: + input.checkInput(currentTerm().checkInputCount); + BACKTRACK(); + + case ByteTerm::TypeDotStarEnclosure: + ASSERT_NOT_REACHED(); + } + + ASSERT_NOT_REACHED(); + return JSRegExpErrorNoMatch; + } + + JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) + { + JSRegExpResult result = matchDisjunction(disjunction, context, btrack); + + if (result == JSRegExpMatch) { + while (context->matchBegin == context->matchEnd) { + result = matchDisjunction(disjunction, context, true); + if (result != JSRegExpMatch) + return result; + } + return JSRegExpMatch; + } + + return result; + } + + unsigned interpret() + { + if (!input.isAvailableInput(0)) + return offsetNoMatch; + + for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) + output[i << 1] = offsetNoMatch; + + allocatorPool = pattern->m_allocator->startAllocator(); + if (!allocatorPool) + CRASH(); + + DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); + + JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false); + if (result == JSRegExpMatch) { + output[0] = context->matchBegin; + output[1] = context->matchEnd; + } + + freeDisjunctionContext(context); + + pattern->m_allocator->stopAllocator(); + + ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch)); + return output[0]; + } + + Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) + : pattern(pattern) + , output(output) + , input(input, start, length) + , allocatorPool(0) + , remainingMatchCount(matchLimit) + { + } + +private: + BytecodePattern* pattern; + unsigned* output; + InputStream input; + BumpPointerPool* allocatorPool; + unsigned remainingMatchCount; +}; + + + +class ByteCompiler { + struct ParenthesesStackEntry { + unsigned beginTerm; + unsigned savedAlternativeIndex; + ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) + : beginTerm(beginTerm) + , savedAlternativeIndex(savedAlternativeIndex) + { + } + }; + +public: + ByteCompiler(YarrPattern& pattern) + : m_pattern(pattern) + { + m_currentAlternativeIndex = 0; + } + + PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) + { + regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); + emitDisjunction(m_pattern.m_body); + regexEnd(); + + return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); + } + + void checkInput(unsigned count) + { + m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); + } + + void uncheckInput(unsigned count) + { + m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); + } + + void assertionBOL(unsigned inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); + } + + void assertionEOL(unsigned inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); + } + + void assertionWordBoundary(bool invert, unsigned inputPosition) + { + m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); + } + + void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + if (m_pattern.m_ignoreCase) { + UChar lo = Unicode::toLower(ch); + UChar hi = Unicode::toUpper(ch); + + if (lo != hi) { + m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + return; + } + } + + m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + } + + void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + ASSERT(subpatternId); + + m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); + + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + } + + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + { + // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, + // then fix this up at the end! - simplifying this should make it much clearer. + // https://bugs.webkit.org/show_bug.cgi?id=50136 + + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) + { + int beginTerm = m_bodyDisjunction->terms.size(); + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; + + m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); + m_currentAlternativeIndex = beginTerm + 1; + } + + void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); + + bool invert = m_bodyDisjunction->terms[beginTerm].invert(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored) + { + m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); + } + + unsigned popParenthesesStack() + { + ASSERT(m_parenthesesStack.size()); + int stackEnd = m_parenthesesStack.size() - 1; + unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; + m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; + m_parenthesesStack.shrink(stackEnd); + + ASSERT(beginTerm < m_bodyDisjunction->terms.size()); + ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); + + return beginTerm; + } + +#ifndef NDEBUG + void dumpDisjunction(ByteDisjunction* disjunction) + { + dataLogF("ByteDisjunction(%p):\n\t", disjunction); + for (unsigned i = 0; i < disjunction->terms.size(); ++i) + dataLogF("{ %d } ", disjunction->terms[i].type); + dataLogF("\n"); + } +#endif + + void closeAlternative(int beginTerm) + { + int origBeginTerm = beginTerm; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + if (!m_bodyDisjunction->terms[beginTerm].alternative.next) + m_bodyDisjunction->terms.remove(beginTerm); + else { + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + } + + void closeBodyAlternative() + { + int beginTerm = 0; + int origBeginTerm = 0; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); + int endIndex = m_bodyDisjunction->terms.size(); + + unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; + + while (m_bodyDisjunction->terms[beginTerm].alternative.next) { + beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); + m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; + + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); + m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; + } + + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; + + bool capture = parenthesesBegin.capture(); + unsigned subpatternId = parenthesesBegin.atom.subpatternId; + + unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; + ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + + parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); + for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); + parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); + + m_bodyDisjunction->terms.shrink(beginTerm); + + m_allParenthesesInfo.append(parenthesesDisjunction); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; + } + + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + { + unsigned beginTerm = popParenthesesStack(); + closeAlternative(beginTerm + 1); + unsigned endTerm = m_bodyDisjunction->terms.size(); + + ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); + + bool capture = m_bodyDisjunction->terms[beginTerm].capture(); + unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; + + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); + m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; + m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; + + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; + } + + void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) + { + m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); + m_bodyDisjunction->terms[0].frameLocation = 0; + m_currentAlternativeIndex = 0; + } + + void regexEnd() + { + closeBodyAlternative(); + } + + void alternativeBodyDisjunction(bool onceThrough) + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void alternativeDisjunction() + { + int newAlternativeIndex = m_bodyDisjunction->terms.size(); + m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; + m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); + + m_currentAlternativeIndex = newAlternativeIndex; + } + + void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) + { + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; + + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + + if (alt) { + if (disjunction == m_pattern.m_body) + alternativeBodyDisjunction(alternative->onceThrough()); + else + alternativeDisjunction(); + } + + unsigned minimumSize = alternative->m_minimumSize; + ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); + unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; + + if (countToCheck) { + checkInput(countToCheck); + currentCountAlreadyChecked += countToCheck; + } + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + assertionBOL(currentCountAlreadyChecked - term.inputPosition); + break; + + case PatternTerm::TypeAssertionEOL: + assertionEOL(currentCountAlreadyChecked - term.inputPosition); + break; + + case PatternTerm::TypeAssertionWordBoundary: + assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition); + break; + + case PatternTerm::TypePatternCharacter: + atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeCharacterClass: + atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeBackReference: + atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: { + unsigned disjunctionAlreadyCheckedCount = 0; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + unsigned alternativeFrameLocation = term.frameLocation; + // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. + if (term.quantityType == QuantifierFixedCount) + disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; + else + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else if (term.parentheses.isTerminal) { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + } else { + unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + } + break; + } + + case PatternTerm::TypeParentheticalAssertion: { + unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; + + ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); + unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition); + unsigned uncheckAmount = 0; + if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { + uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; + uncheckInput(uncheckAmount); + currentCountAlreadyChecked -= uncheckAmount; + } + + atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); + emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); + if (uncheckAmount) { + checkInput(uncheckAmount); + currentCountAlreadyChecked += uncheckAmount; + } + break; + } + + case PatternTerm::TypeDotStarEnclosure: + assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor); + break; + } + } + } + } + +private: + YarrPattern& m_pattern; + OwnPtr<ByteDisjunction> m_bodyDisjunction; + unsigned m_currentAlternativeIndex; + Vector<ParenthesesStackEntry> m_parenthesesStack; + Vector<ByteDisjunction*> m_allParenthesesInfo; +}; + +PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) +{ + return ByteCompiler(pattern).compile(allocator); +} + +unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output) +{ + if (input.is8Bit()) + return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); + return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); +} + +unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) +{ + return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); +} + +unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) +{ + return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); +} + +// These should be the same for both UChar & LChar. +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); + + +} } diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.h b/src/3rdparty/masm/yarr/YarrInterpreter.h new file mode 100644 index 0000000000..fb60bd979d --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrInterpreter.h @@ -0,0 +1,385 @@ +/* + * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrInterpreter_h +#define YarrInterpreter_h + +#include "YarrPattern.h" +#include <wtf/PassOwnPtr.h> +#include <wtf/unicode/Unicode.h> + +namespace WTF { +class BumpPointerAllocator; +} +using WTF::BumpPointerAllocator; + +namespace JSC { namespace Yarr { + +class ByteDisjunction; + +struct ByteTerm { + enum Type { + TypeBodyAlternativeBegin, + TypeBodyAlternativeDisjunction, + TypeBodyAlternativeEnd, + TypeAlternativeBegin, + TypeAlternativeDisjunction, + TypeAlternativeEnd, + TypeSubpatternBegin, + TypeSubpatternEnd, + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacterOnce, + TypePatternCharacterFixed, + TypePatternCharacterGreedy, + TypePatternCharacterNonGreedy, + TypePatternCasedCharacterOnce, + TypePatternCasedCharacterFixed, + TypePatternCasedCharacterGreedy, + TypePatternCasedCharacterNonGreedy, + TypeCharacterClass, + TypeBackReference, + TypeParenthesesSubpattern, + TypeParenthesesSubpatternOnceBegin, + TypeParenthesesSubpatternOnceEnd, + TypeParenthesesSubpatternTerminalBegin, + TypeParenthesesSubpatternTerminalEnd, + TypeParentheticalAssertionBegin, + TypeParentheticalAssertionEnd, + TypeCheckInput, + TypeUncheckInput, + TypeDotStarEnclosure, + } type; + union { + struct { + union { + UChar patternCharacter; + struct { + UChar lo; + UChar hi; + } casedCharacter; + CharacterClass* characterClass; + unsigned subpatternId; + }; + union { + ByteDisjunction* parenthesesDisjunction; + unsigned parenthesesWidth; + }; + QuantifierType quantityType; + unsigned quantityCount; + } atom; + struct { + int next; + int end; + bool onceThrough; + } alternative; + struct { + bool m_bol : 1; + bool m_eol : 1; + } anchors; + unsigned checkInputCount; + }; + unsigned frameLocation; + bool m_capture : 1; + bool m_invert : 1; + unsigned inputPosition; + + ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCharacterNonGreedy; + break; + } + + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount.unsafeGet(); + inputPosition = inputPos; + } + + ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + : frameLocation(frameLocation) + , m_capture(false) + , m_invert(false) + { + switch (quantityType) { + case QuantifierFixedCount: + type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; + break; + case QuantifierGreedy: + type = ByteTerm::TypePatternCasedCharacterGreedy; + break; + case QuantifierNonGreedy: + type = ByteTerm::TypePatternCasedCharacterNonGreedy; + break; + } + + atom.casedCharacter.lo = lo; + atom.casedCharacter.hi = hi; + atom.quantityType = quantityType; + atom.quantityCount = quantityCount.unsafeGet(); + inputPosition = inputPos; + } + + ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + : type(ByteTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + atom.characterClass = characterClass; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(false) + { + atom.subpatternId = subpatternId; + atom.parenthesesDisjunction = parenthesesInfo; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + ByteTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + } + + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + atom.subpatternId = subpatternId; + atom.quantityType = QuantifierFixedCount; + atom.quantityCount = 1; + inputPosition = inputPos; + } + + static ByteTerm BOL(int inputPos) + { + ByteTerm term(TypeAssertionBOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm CheckInput(Checked<unsigned> count) + { + ByteTerm term(TypeCheckInput); + term.checkInputCount = count.unsafeGet(); + return term; + } + + static ByteTerm UncheckInput(Checked<unsigned> count) + { + ByteTerm term(TypeUncheckInput); + term.checkInputCount = count.unsafeGet(); + return term; + } + + static ByteTerm EOL(int inputPos) + { + ByteTerm term(TypeAssertionEOL); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm WordBoundary(bool invert, int inputPos) + { + ByteTerm term(TypeAssertionWordBoundary, invert); + term.inputPosition = inputPos; + return term; + } + + static ByteTerm BackReference(unsigned subpatternId, int inputPos) + { + return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); + } + + static ByteTerm BodyAlternativeBegin(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeDisjunction(bool onceThrough) + { + ByteTerm term(TypeBodyAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = onceThrough; + return term; + } + + static ByteTerm BodyAlternativeEnd() + { + ByteTerm term(TypeBodyAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeBegin() + { + ByteTerm term(TypeAlternativeBegin); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeDisjunction() + { + ByteTerm term(TypeAlternativeDisjunction); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm AlternativeEnd() + { + ByteTerm term(TypeAlternativeEnd); + term.alternative.next = 0; + term.alternative.end = 0; + term.alternative.onceThrough = false; + return term; + } + + static ByteTerm SubpatternBegin() + { + return ByteTerm(TypeSubpatternBegin); + } + + static ByteTerm SubpatternEnd() + { + return ByteTerm(TypeSubpatternEnd); + } + + static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) + { + ByteTerm term(TypeDotStarEnclosure); + term.anchors.m_bol = bolAnchor; + term.anchors.m_eol = eolAnchor; + return term; + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } +}; + +class ByteDisjunction { + WTF_MAKE_FAST_ALLOCATED; +public: + ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) + : m_numSubpatterns(numSubpatterns) + , m_frameSize(frameSize) + { + } + + Vector<ByteTerm> terms; + unsigned m_numSubpatterns; + unsigned m_frameSize; +}; + +struct BytecodePattern { + WTF_MAKE_FAST_ALLOCATED; +public: + BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator) + : m_body(body) + , m_ignoreCase(pattern.m_ignoreCase) + , m_multiline(pattern.m_multiline) + , m_allocator(allocator) + { + newlineCharacterClass = pattern.newlineCharacterClass(); + wordcharCharacterClass = pattern.wordcharCharacterClass(); + + m_allParenthesesInfo.append(allParenthesesInfo); + m_userCharacterClasses.append(pattern.m_userCharacterClasses); + // 'Steal' the YarrPattern's CharacterClasses! We clear its + // array, so that it won't delete them on destruction. We'll + // take responsibility for that. + pattern.m_userCharacterClasses.clear(); + } + + ~BytecodePattern() + { + deleteAllValues(m_allParenthesesInfo); + deleteAllValues(m_userCharacterClasses); + } + + OwnPtr<ByteDisjunction> m_body; + bool m_ignoreCase; + bool m_multiline; + // Each BytecodePattern is associated with a RegExp, each RegExp is associated + // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. + BumpPointerAllocator* m_allocator; + + CharacterClass* newlineCharacterClass; + CharacterClass* wordcharCharacterClass; + +private: + Vector<ByteDisjunction*> m_allParenthesesInfo; + Vector<CharacterClass*> m_userCharacterClasses; +}; + +JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); +JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); +unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); +unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); + +} } // namespace JSC::Yarr + +#endif // YarrInterpreter_h diff --git a/src/3rdparty/masm/yarr/YarrJIT.cpp b/src/3rdparty/masm/yarr/YarrJIT.cpp new file mode 100644 index 0000000000..ce84e2c74f --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrJIT.cpp @@ -0,0 +1,2667 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrJIT.h" + +#include <wtf/ASCIICType.h> +#include "LinkBuffer.h" +#include "Options.h" +#include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" + +#if ENABLE(YARR_JIT) + +using namespace WTF; + +namespace JSC { namespace Yarr { + +template<YarrJITCompileMode compileMode> +class YarrGenerator : private MacroAssembler { + friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); + +#if CPU(ARM) + static const RegisterID input = ARMRegisters::r0; + static const RegisterID index = ARMRegisters::r1; + static const RegisterID length = ARMRegisters::r2; + static const RegisterID output = ARMRegisters::r4; + + static const RegisterID regT0 = ARMRegisters::r5; + static const RegisterID regT1 = ARMRegisters::r6; + + static const RegisterID returnRegister = ARMRegisters::r0; + static const RegisterID returnRegister2 = ARMRegisters::r1; +#elif CPU(MIPS) + static const RegisterID input = MIPSRegisters::a0; + static const RegisterID index = MIPSRegisters::a1; + static const RegisterID length = MIPSRegisters::a2; + static const RegisterID output = MIPSRegisters::a3; + + static const RegisterID regT0 = MIPSRegisters::t4; + static const RegisterID regT1 = MIPSRegisters::t5; + + static const RegisterID returnRegister = MIPSRegisters::v0; + static const RegisterID returnRegister2 = MIPSRegisters::v1; +#elif CPU(SH4) + static const RegisterID input = SH4Registers::r4; + static const RegisterID index = SH4Registers::r5; + static const RegisterID length = SH4Registers::r6; + static const RegisterID output = SH4Registers::r7; + + static const RegisterID regT0 = SH4Registers::r0; + static const RegisterID regT1 = SH4Registers::r1; + + static const RegisterID returnRegister = SH4Registers::r0; + static const RegisterID returnRegister2 = SH4Registers::r1; +#elif CPU(X86) + static const RegisterID input = X86Registers::eax; + static const RegisterID index = X86Registers::edx; + static const RegisterID length = X86Registers::ecx; + static const RegisterID output = X86Registers::edi; + + static const RegisterID regT0 = X86Registers::ebx; + static const RegisterID regT1 = X86Registers::esi; + + static const RegisterID returnRegister = X86Registers::eax; + static const RegisterID returnRegister2 = X86Registers::edx; +#elif CPU(X86_64) + static const RegisterID input = X86Registers::edi; + static const RegisterID index = X86Registers::esi; + static const RegisterID length = X86Registers::edx; + static const RegisterID output = X86Registers::ecx; + + static const RegisterID regT0 = X86Registers::eax; + static const RegisterID regT1 = X86Registers::ebx; + + static const RegisterID returnRegister = X86Registers::eax; + static const RegisterID returnRegister2 = X86Registers::edx; +#endif + + void optimizeAlternative(PatternAlternative* alternative) + { + if (!alternative->m_terms.size()) + return; + + for (unsigned i = 0; i < alternative->m_terms.size() - 1; ++i) { + PatternTerm& term = alternative->m_terms[i]; + PatternTerm& nextTerm = alternative->m_terms[i + 1]; + + if ((term.type == PatternTerm::TypeCharacterClass) + && (term.quantityType == QuantifierFixedCount) + && (nextTerm.type == PatternTerm::TypePatternCharacter) + && (nextTerm.quantityType == QuantifierFixedCount)) { + PatternTerm termCopy = term; + alternative->m_terms[i] = nextTerm; + alternative->m_terms[i + 1] = termCopy; + } + } + } + + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + { + do { + // pick which range we're going to generate + int which = count >> 1; + char lo = ranges[which].begin; + char hi = ranges[which].end; + + // check if there are any ranges or matches below lo. If not, just jl to failure - + // if there is anything else to check, check that first, if it falls through jmp to failure. + if ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + // generate code for all ranges before this one + if (which) + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] < lo)) { + matchDest.append(branch32(Equal, character, Imm32((unsigned short)matches[*matchIndex]))); + ++*matchIndex; + } + failures.append(jump()); + + loOrAbove.link(this); + } else if (which) { + Jump loOrAbove = branch32(GreaterThanOrEqual, character, Imm32((unsigned short)lo)); + + matchCharacterClassRange(character, failures, matchDest, ranges, which, matchIndex, matches, matchCount); + failures.append(jump()); + + loOrAbove.link(this); + } else + failures.append(branch32(LessThan, character, Imm32((unsigned short)lo))); + + while ((*matchIndex < matchCount) && (matches[*matchIndex] <= hi)) + ++*matchIndex; + + matchDest.append(branch32(LessThanOrEqual, character, Imm32((unsigned short)hi))); + // fall through to here, the value is above hi. + + // shuffle along & loop around if there are any more matches to handle. + unsigned next = which + 1; + ranges += next; + count -= next; + } while (count); + } + + void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) + { + if (charClass->m_table) { + ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table->m_table)); + matchDest.append(branchTest8(charClass->m_table->m_inverted ? Zero : NonZero, tableEntry)); + return; + } + Jump unicodeFail; + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { + Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f)); + + if (charClass->m_matchesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { + UChar ch = charClass->m_matchesUnicode[i]; + matchDest.append(branch32(Equal, character, Imm32(ch))); + } + } + + if (charClass->m_rangesUnicode.size()) { + for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { + UChar lo = charClass->m_rangesUnicode[i].begin; + UChar hi = charClass->m_rangesUnicode[i].end; + + Jump below = branch32(LessThan, character, Imm32(lo)); + matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); + below.link(this); + } + } + + unicodeFail = jump(); + isAscii.link(this); + } + + if (charClass->m_ranges.size()) { + unsigned matchIndex = 0; + JumpList failures; + matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.begin(), charClass->m_ranges.size(), &matchIndex, charClass->m_matches.begin(), charClass->m_matches.size()); + while (matchIndex < charClass->m_matches.size()) + matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); + + failures.link(this); + } else if (charClass->m_matches.size()) { + // optimization: gather 'a','A' etc back together, can mask & test once. + Vector<char> matchesAZaz; + + for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { + char ch = charClass->m_matches[i]; + if (m_pattern.m_ignoreCase) { + if (isASCIILower(ch)) { + matchesAZaz.append(ch); + continue; + } + if (isASCIIUpper(ch)) + continue; + } + matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); + } + + if (unsigned countAZaz = matchesAZaz.size()) { + or32(TrustedImm32(32), character); + for (unsigned i = 0; i < countAZaz; ++i) + matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i]))); + } + } + + if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) + unicodeFail.link(this); + } + + // Jumps if input not available; will have (incorrectly) incremented already! + Jump jumpIfNoAvailableInput(unsigned countToCheck = 0) + { + if (countToCheck) + add32(Imm32(countToCheck), index); + return branch32(Above, index, length); + } + + Jump jumpIfAvailableInput(unsigned countToCheck) + { + add32(Imm32(countToCheck), index); + return branch32(BelowOrEqual, index, length); + } + + Jump checkInput() + { + return branch32(BelowOrEqual, index, length); + } + + Jump atEndOfInput() + { + return branch32(Equal, index, length); + } + + Jump notAtEndOfInput() + { + return branch32(NotEqual, index, length); + } + + Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character) + { + readCharacter(inputPosition, character); + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + or32(TrustedImm32(0x20), character); + ch |= 0x20; + } + + return branch32(NotEqual, character, Imm32(ch)); + } + + void readCharacter(int inputPosition, RegisterID reg) + { + if (m_charSize == Char8) + load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg); + else + load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + } + + void storeToFrame(RegisterID reg, unsigned frameLocation) + { + poke(reg, frameLocation); + } + + void storeToFrame(TrustedImm32 imm, unsigned frameLocation) + { + poke(imm, frameLocation); + } + + DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) + { + return storePtrWithPatch(TrustedImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + void loadFromFrame(unsigned frameLocation, RegisterID reg) + { + peek(reg, frameLocation); + } + + void loadFromFrameAndJump(unsigned frameLocation) + { + jump(Address(stackPointerRegister, frameLocation * sizeof(void*))); + } + + void initCallFrame() + { + unsigned callFrameSize = m_pattern.m_body->m_callFrameSize; + if (callFrameSize) + subPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister); + } + void removeCallFrame() + { + unsigned callFrameSize = m_pattern.m_body->m_callFrameSize; + if (callFrameSize) + addPtr(Imm32(callFrameSize * sizeof(void*)), stackPointerRegister); + } + + // Used to record subpatters, should only be called if compileMode is IncludeSubpatterns. + void setSubpatternStart(RegisterID reg, unsigned subpattern) + { + ASSERT(subpattern); + // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-( + store32(reg, Address(output, (subpattern << 1) * sizeof(int))); + } + void setSubpatternEnd(RegisterID reg, unsigned subpattern) + { + ASSERT(subpattern); + // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-( + store32(reg, Address(output, ((subpattern << 1) + 1) * sizeof(int))); + } + void clearSubpatternStart(unsigned subpattern) + { + ASSERT(subpattern); + // FIXME: should be able to ASSERT(compileMode == IncludeSubpatterns), but then this function is conditionally NORETURN. :-( + store32(TrustedImm32(-1), Address(output, (subpattern << 1) * sizeof(int))); + } + + // We use one of three different strategies to track the start of the current match, + // while matching. + // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily + // at the end of matching. This is irrespective of compileMode, and in this case + // these methods should never be called. + // 2) If we're compiling IncludeSubpatterns, 'output' contains a pointer to an output + // vector, store the match start in the output vector. + // 3) If we're compiling MatchOnly, 'output' is unused, store the match start directly + // in this register. + void setMatchStart(RegisterID reg) + { + ASSERT(!m_pattern.m_body->m_hasFixedSize); + if (compileMode == IncludeSubpatterns) + store32(reg, output); + else + move(reg, output); + } + void getMatchStart(RegisterID reg) + { + ASSERT(!m_pattern.m_body->m_hasFixedSize); + if (compileMode == IncludeSubpatterns) + load32(output, reg); + else + move(output, reg); + } + + enum YarrOpCode { + // These nodes wrap body alternatives - those in the main disjunction, + // rather than subpatterns or assertions. These are chained together in + // a doubly linked list, with a 'begin' node for the first alternative, + // a 'next' node for each subsequent alternative, and an 'end' node at + // the end. In the case of repeating alternatives, the 'end' node also + // has a reference back to 'begin'. + OpBodyAlternativeBegin, + OpBodyAlternativeNext, + OpBodyAlternativeEnd, + // Similar to the body alternatives, but used for subpatterns with two + // or more alternatives. + OpNestedAlternativeBegin, + OpNestedAlternativeNext, + OpNestedAlternativeEnd, + // Used for alternatives in subpatterns where there is only a single + // alternative (backtrackingis easier in these cases), or for alternatives + // which never need to be backtracked (those in parenthetical assertions, + // terminal subpatterns). + OpSimpleNestedAlternativeBegin, + OpSimpleNestedAlternativeNext, + OpSimpleNestedAlternativeEnd, + // Used to wrap 'Once' subpattern matches (quantityCount == 1). + OpParenthesesSubpatternOnceBegin, + OpParenthesesSubpatternOnceEnd, + // Used to wrap 'Terminal' subpattern matches (at the end of the regexp). + OpParenthesesSubpatternTerminalBegin, + OpParenthesesSubpatternTerminalEnd, + // Used to wrap parenthetical assertions. + OpParentheticalAssertionBegin, + OpParentheticalAssertionEnd, + // Wraps all simple terms (pattern characters, character classes). + OpTerm, + // Where an expression contains only 'once through' body alternatives + // and no repeating ones, this op is used to return match failure. + OpMatchFailed + }; + + // This structure is used to hold the compiled opcode information, + // including reference back to the original PatternTerm/PatternAlternatives, + // and JIT compilation data structures. + struct YarrOp { + explicit YarrOp(PatternTerm* term) + : m_op(OpTerm) + , m_term(term) + , m_isDeadCode(false) + { + } + + explicit YarrOp(YarrOpCode op) + : m_op(op) + , m_isDeadCode(false) + { + } + + // The operation, as a YarrOpCode, and also a reference to the PatternTerm. + YarrOpCode m_op; + PatternTerm* m_term; + + // For alternatives, this holds the PatternAlternative and doubly linked + // references to this alternative's siblings. In the case of the + // OpBodyAlternativeEnd node at the end of a section of repeating nodes, + // m_nextOp will reference the OpBodyAlternativeBegin node of the first + // repeating alternative. + PatternAlternative* m_alternative; + size_t m_previousOp; + size_t m_nextOp; + + // Used to record a set of Jumps out of the generated code, typically + // used for jumps out to backtracking code, and a single reentry back + // into the code for a node (likely where a backtrack will trigger + // rematching). + Label m_reentry; + JumpList m_jumps; + + // Used for backtracking when the prior alternative did not consume any + // characters but matched. + Jump m_zeroLengthMatch; + + // This flag is used to null out the second pattern character, when + // two are fused to match a pair together. + bool m_isDeadCode; + + // Currently used in the case of some of the more complex management of + // 'm_checked', to cache the offset used in this alternative, to avoid + // recalculating it. + int m_checkAdjust; + + // Used by OpNestedAlternativeNext/End to hold the pointer to the + // value that will be pushed into the pattern's frame to return to, + // upon backtracking back into the disjunction. + DataLabelPtr m_returnAddress; + }; + + // BacktrackingState + // This class encapsulates information about the state of code generation + // whilst generating the code for backtracking, when a term fails to match. + // Upon entry to code generation of the backtracking code for a given node, + // the Backtracking state will hold references to all control flow sources + // that are outputs in need of further backtracking from the prior node + // generated (which is the subsequent operation in the regular expression, + // and in the m_ops Vector, since we generated backtracking backwards). + // These references to control flow take the form of: + // - A jump list of jumps, to be linked to code that will backtrack them + // further. + // - A set of DataLabelPtr values, to be populated with values to be + // treated effectively as return addresses backtracking into complex + // subpatterns. + // - A flag indicating that the current sequence of generated code up to + // this point requires backtracking. + class BacktrackingState { + public: + BacktrackingState() + : m_pendingFallthrough(false) + { + } + + // Add a jump or jumps, a return address, or set the flag indicating + // that the current 'fallthrough' control flow requires backtracking. + void append(const Jump& jump) + { + m_laterFailures.append(jump); + } + void append(JumpList& jumpList) + { + m_laterFailures.append(jumpList); + } + void append(const DataLabelPtr& returnAddress) + { + m_pendingReturns.append(returnAddress); + } + void fallthrough() + { + ASSERT(!m_pendingFallthrough); + m_pendingFallthrough = true; + } + + // These methods clear the backtracking state, either linking to the + // current location, a provided label, or copying the backtracking out + // to a JumpList. All actions may require code generation to take place, + // and as such are passed a pointer to the assembler. + void link(MacroAssembler* assembler) + { + if (m_pendingReturns.size()) { + Label here(assembler); + for (unsigned i = 0; i < m_pendingReturns.size(); ++i) + m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here)); + m_pendingReturns.clear(); + } + m_laterFailures.link(assembler); + m_laterFailures.clear(); + m_pendingFallthrough = false; + } + void linkTo(Label label, MacroAssembler* assembler) + { + if (m_pendingReturns.size()) { + for (unsigned i = 0; i < m_pendingReturns.size(); ++i) + m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], label)); + m_pendingReturns.clear(); + } + if (m_pendingFallthrough) + assembler->jump(label); + m_laterFailures.linkTo(label, assembler); + m_laterFailures.clear(); + m_pendingFallthrough = false; + } + void takeBacktracksToJumpList(JumpList& jumpList, MacroAssembler* assembler) + { + if (m_pendingReturns.size()) { + Label here(assembler); + for (unsigned i = 0; i < m_pendingReturns.size(); ++i) + m_backtrackRecords.append(ReturnAddressRecord(m_pendingReturns[i], here)); + m_pendingReturns.clear(); + m_pendingFallthrough = true; + } + if (m_pendingFallthrough) + jumpList.append(assembler->jump()); + jumpList.append(m_laterFailures); + m_laterFailures.clear(); + m_pendingFallthrough = false; + } + + bool isEmpty() + { + return m_laterFailures.empty() && m_pendingReturns.isEmpty() && !m_pendingFallthrough; + } + + // Called at the end of code generation to link all return addresses. + void linkDataLabels(LinkBuffer& linkBuffer) + { + ASSERT(isEmpty()); + for (unsigned i = 0; i < m_backtrackRecords.size(); ++i) + linkBuffer.patch(m_backtrackRecords[i].m_dataLabel, linkBuffer.locationOf(m_backtrackRecords[i].m_backtrackLocation)); + } + + private: + struct ReturnAddressRecord { + ReturnAddressRecord(DataLabelPtr dataLabel, Label backtrackLocation) + : m_dataLabel(dataLabel) + , m_backtrackLocation(backtrackLocation) + { + } + + DataLabelPtr m_dataLabel; + Label m_backtrackLocation; + }; + + JumpList m_laterFailures; + bool m_pendingFallthrough; + Vector<DataLabelPtr, 4> m_pendingReturns; + Vector<ReturnAddressRecord, 4> m_backtrackRecords; + }; + + // Generation methods: + // =================== + + // This method provides a default implementation of backtracking common + // to many terms; terms commonly jump out of the forwards matching path + // on any failed conditions, and add these jumps to the m_jumps list. If + // no special handling is required we can often just backtrack to m_jumps. + void backtrackTermDefault(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + m_backtrackingState.append(op.m_jumps); + } + + void generateAssertionBOL(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (!term->inputPosition) + matchDest.append(branch32(Equal, index, Imm32(m_checked))); + + readCharacter((term->inputPosition - m_checked) - 1, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + op.m_jumps.append(jump()); + + matchDest.link(this); + } else { + // Erk, really should poison out these alternatives early. :-/ + if (term->inputPosition) + op.m_jumps.append(jump()); + else + op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked))); + } + } + void backtrackAssertionBOL(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generateAssertionEOL(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + if (m_pattern.m_multiline) { + const RegisterID character = regT0; + + JumpList matchDest; + if (term->inputPosition == m_checked) + matchDest.append(atEndOfInput()); + + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); + op.m_jumps.append(jump()); + + matchDest.link(this); + } else { + if (term->inputPosition == m_checked) + op.m_jumps.append(notAtEndOfInput()); + // Erk, really should poison out these alternatives early. :-/ + else + op.m_jumps.append(jump()); + } + } + void backtrackAssertionEOL(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + // Also falls though on nextIsNotWordChar. + void matchAssertionWordchar(size_t opIndex, JumpList& nextIsWordChar, JumpList& nextIsNotWordChar) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + + if (term->inputPosition == m_checked) + nextIsNotWordChar.append(atEndOfInput()); + + readCharacter((term->inputPosition - m_checked), character); + matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); + } + + void generateAssertionWordBoundary(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + + Jump atBegin; + JumpList matchDest; + if (!term->inputPosition) + atBegin = branch32(Equal, index, Imm32(m_checked)); + readCharacter((term->inputPosition - m_checked) - 1, character); + matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); + if (!term->inputPosition) + atBegin.link(this); + + // We fall through to here if the last character was not a wordchar. + JumpList nonWordCharThenWordChar; + JumpList nonWordCharThenNonWordChar; + if (term->invert()) { + matchAssertionWordchar(opIndex, nonWordCharThenNonWordChar, nonWordCharThenWordChar); + nonWordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(opIndex, nonWordCharThenWordChar, nonWordCharThenNonWordChar); + nonWordCharThenNonWordChar.append(jump()); + } + op.m_jumps.append(nonWordCharThenNonWordChar); + + // We jump here if the last character was a wordchar. + matchDest.link(this); + JumpList wordCharThenWordChar; + JumpList wordCharThenNonWordChar; + if (term->invert()) { + matchAssertionWordchar(opIndex, wordCharThenNonWordChar, wordCharThenWordChar); + wordCharThenWordChar.append(jump()); + } else { + matchAssertionWordchar(opIndex, wordCharThenWordChar, wordCharThenNonWordChar); + // This can fall-though! + } + + op.m_jumps.append(wordCharThenWordChar); + + nonWordCharThenWordChar.link(this); + wordCharThenNonWordChar.link(this); + } + void backtrackAssertionWordBoundary(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generatePatternCharacterOnce(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + + if (op.m_isDeadCode) + return; + + // m_ops always ends with a OpBodyAlternativeEnd or OpMatchFailed + // node, so there must always be at least one more node. + ASSERT(opIndex + 1 < m_ops.size()); + YarrOp* nextOp = &m_ops[opIndex + 1]; + + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + if ((ch > 0xff) && (m_charSize == Char8)) { + // Have a 16 bit pattern character and an 8 bit string - short circuit + op.m_jumps.append(jump()); + return; + } + + const RegisterID character = regT0; + int maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; + unsigned ignoreCaseMask = 0; + int allCharacters = ch; + int numberCharacters; + int startTermPosition = term->inputPosition; + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) + ignoreCaseMask |= 32; + + for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) { + PatternTerm* nextTerm = nextOp->m_term; + + if (nextTerm->type != PatternTerm::TypePatternCharacter + || nextTerm->quantityType != QuantifierFixedCount + || nextTerm->quantityCount != 1 + || nextTerm->inputPosition != (startTermPosition + numberCharacters)) + break; + + nextOp->m_isDeadCode = true; + + int shiftAmount = (m_charSize == Char8 ? 8 : 16) * numberCharacters; + + UChar currentCharacter = nextTerm->patternCharacter; + + if ((currentCharacter > 0xff) && (m_charSize == Char8)) { + // Have a 16 bit pattern character and an 8 bit string - short circuit + op.m_jumps.append(jump()); + return; + } + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter)); + + allCharacters |= (currentCharacter << shiftAmount); + + if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(currentCharacter))) + ignoreCaseMask |= 32 << shiftAmount; + } + + if (m_charSize == Char8) { + switch (numberCharacters) { + case 1: + op.m_jumps.append(jumpIfCharNotEquals(ch, startTermPosition - m_checked, character)); + return; + case 2: { + BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); + load16Unaligned(address, character); + break; + } + case 3: { + BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); + load16Unaligned(highAddress, character); + if (ignoreCaseMask) + or32(Imm32(ignoreCaseMask), character); + op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask))); + op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, startTermPosition + 2 - m_checked, character)); + return; + } + case 4: { + BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); + load32WithUnalignedHalfWords(address, character); + break; + } + } + } else { + switch (numberCharacters) { + case 1: + op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + return; + case 2: + BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar)); + load32WithUnalignedHalfWords(address, character); + break; + } + } + + if (ignoreCaseMask) + or32(Imm32(ignoreCaseMask), character); + op.m_jumps.append(branch32(NotEqual, character, Imm32(allCharacters | ignoreCaseMask))); + return; + } + void backtrackPatternCharacterOnce(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generatePatternCharacterFixed(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(index, countRegister); + sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + + Label loop(this); + BaseIndex address(input, countRegister, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(m_charSize == Char8 ? sizeof(char) : sizeof(UChar))).unsafeGet()); + + if (m_charSize == Char8) + load8(address, character); + else + load16(address, character); + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. + ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + or32(TrustedImm32(0x20), character); + ch |= 0x20; + } + + op.m_jumps.append(branch32(NotEqual, character, Imm32(ch))); + add32(TrustedImm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + void backtrackPatternCharacterFixed(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generatePatternCharacterGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + + // Unless have a 16 bit pattern character and an 8 bit string - short circuit + if (!((ch > 0xff) && (m_charSize == Char8))) { + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + if (term->quantityCount == quantifyInfinite) + jump(loop); + else + branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + + failures.link(this); + } + op.m_reentry = label(); + + storeToFrame(countRegister, term->frameLocation); + } + void backtrackPatternCharacterGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + m_backtrackingState.append(branchTest32(Zero, countRegister)); + sub32(TrustedImm32(1), countRegister); + sub32(TrustedImm32(1), index); + jump(op.m_reentry); + } + + void generatePatternCharacterNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + op.m_reentry = label(); + storeToFrame(countRegister, term->frameLocation); + } + void backtrackPatternCharacterNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + UChar ch = term->patternCharacter; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + + // Unless have a 16 bit pattern character and an 8 bit string - short circuit + if (!((ch > 0xff) && (m_charSize == Char8))) { + JumpList nonGreedyFailures; + nonGreedyFailures.append(atEndOfInput()); + if (term->quantityCount != quantifyInfinite) + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); + nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + + jump(op.m_reentry); + nonGreedyFailures.link(this); + } + + sub32(countRegister, index); + m_backtrackingState.fallthrough(); + } + + void generateCharacterClassOnce(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + + JumpList matchDest; + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, term->characterClass); + + if (term->invert()) + op.m_jumps.append(matchDest); + else { + op.m_jumps.append(jump()); + matchDest.link(this); + } + } + void backtrackCharacterClassOnce(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generateCharacterClassFixed(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(index, countRegister); + sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + + Label loop(this); + JumpList matchDest; + if (m_charSize == Char8) + load8(BaseIndex(input, countRegister, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character); + else + load16(BaseIndex(input, countRegister, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character); + matchCharacterClass(character, matchDest, term->characterClass); + + if (term->invert()) + op.m_jumps.append(matchDest); + else { + op.m_jumps.append(jump()); + matchDest.link(this); + } + + add32(TrustedImm32(1), countRegister); + branch32(NotEqual, countRegister, index).linkTo(loop, this); + } + void backtrackCharacterClassFixed(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + void generateCharacterClassGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + + JumpList failures; + Label loop(this); + failures.append(atEndOfInput()); + + if (term->invert()) { + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, failures, term->characterClass); + } else { + JumpList matchDest; + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, term->characterClass); + failures.append(jump()); + matchDest.link(this); + } + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + if (term->quantityCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + failures.append(jump()); + } else + jump(loop); + + failures.link(this); + op.m_reentry = label(); + + storeToFrame(countRegister, term->frameLocation); + } + void backtrackCharacterClassGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + m_backtrackingState.append(branchTest32(Zero, countRegister)); + sub32(TrustedImm32(1), countRegister); + sub32(TrustedImm32(1), index); + jump(op.m_reentry); + } + + void generateCharacterClassNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID countRegister = regT1; + + move(TrustedImm32(0), countRegister); + op.m_reentry = label(); + storeToFrame(countRegister, term->frameLocation); + } + void backtrackCharacterClassNonGreedy(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID countRegister = regT1; + + JumpList nonGreedyFailures; + + m_backtrackingState.link(this); + + loadFromFrame(term->frameLocation, countRegister); + + nonGreedyFailures.append(atEndOfInput()); + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); + + JumpList matchDest; + readCharacter(term->inputPosition - m_checked, character); + matchCharacterClass(character, matchDest, term->characterClass); + + if (term->invert()) + nonGreedyFailures.append(matchDest); + else { + nonGreedyFailures.append(jump()); + matchDest.link(this); + } + + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + + jump(op.m_reentry); + + nonGreedyFailures.link(this); + sub32(countRegister, index); + m_backtrackingState.fallthrough(); + } + + void generateDotStarEnclosure(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + const RegisterID character = regT0; + const RegisterID matchPos = regT1; + + JumpList foundBeginningNewLine; + JumpList saveStartIndex; + JumpList foundEndingNewLine; + + ASSERT(!m_pattern.m_body->m_hasFixedSize); + getMatchStart(matchPos); + + saveStartIndex.append(branchTest32(Zero, matchPos)); + Label findBOLLoop(this); + sub32(TrustedImm32(1), matchPos); + if (m_charSize == Char8) + load8(BaseIndex(input, matchPos, TimesOne, 0), character); + else + load16(BaseIndex(input, matchPos, TimesTwo, 0), character); + matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass()); + branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this); + saveStartIndex.append(jump()); + + foundBeginningNewLine.link(this); + add32(TrustedImm32(1), matchPos); // Advance past newline + saveStartIndex.link(this); + + if (!m_pattern.m_multiline && term->anchors.bolAnchor) + op.m_jumps.append(branchTest32(NonZero, matchPos)); + + ASSERT(!m_pattern.m_body->m_hasFixedSize); + setMatchStart(matchPos); + + move(index, matchPos); + + Label findEOLLoop(this); + foundEndingNewLine.append(branch32(Equal, matchPos, length)); + if (m_charSize == Char8) + load8(BaseIndex(input, matchPos, TimesOne, 0), character); + else + load16(BaseIndex(input, matchPos, TimesTwo, 0), character); + matchCharacterClass(character, foundEndingNewLine, m_pattern.newlineCharacterClass()); + add32(TrustedImm32(1), matchPos); + jump(findEOLLoop); + + foundEndingNewLine.link(this); + + if (!m_pattern.m_multiline && term->anchors.eolAnchor) + op.m_jumps.append(branch32(NotEqual, matchPos, length)); + + move(matchPos, index); + } + + void backtrackDotStarEnclosure(size_t opIndex) + { + backtrackTermDefault(opIndex); + } + + // Code generation/backtracking for simple terms + // (pattern characters, character classes, and assertions). + // These methods farm out work to the set of functions above. + void generateTerm(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + switch (term->type) { + case PatternTerm::TypePatternCharacter: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + generatePatternCharacterOnce(opIndex); + else + generatePatternCharacterFixed(opIndex); + break; + case QuantifierGreedy: + generatePatternCharacterGreedy(opIndex); + break; + case QuantifierNonGreedy: + generatePatternCharacterNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + generateCharacterClassOnce(opIndex); + else + generateCharacterClassFixed(opIndex); + break; + case QuantifierGreedy: + generateCharacterClassGreedy(opIndex); + break; + case QuantifierNonGreedy: + generateCharacterClassNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeAssertionBOL: + generateAssertionBOL(opIndex); + break; + + case PatternTerm::TypeAssertionEOL: + generateAssertionEOL(opIndex); + break; + + case PatternTerm::TypeAssertionWordBoundary: + generateAssertionWordBoundary(opIndex); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + case PatternTerm::TypeParentheticalAssertion: + ASSERT_NOT_REACHED(); + case PatternTerm::TypeBackReference: + m_shouldFallBack = true; + break; + case PatternTerm::TypeDotStarEnclosure: + generateDotStarEnclosure(opIndex); + break; + } + } + void backtrackTerm(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + switch (term->type) { + case PatternTerm::TypePatternCharacter: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + backtrackPatternCharacterOnce(opIndex); + else + backtrackPatternCharacterFixed(opIndex); + break; + case QuantifierGreedy: + backtrackPatternCharacterGreedy(opIndex); + break; + case QuantifierNonGreedy: + backtrackPatternCharacterNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeCharacterClass: + switch (term->quantityType) { + case QuantifierFixedCount: + if (term->quantityCount == 1) + backtrackCharacterClassOnce(opIndex); + else + backtrackCharacterClassFixed(opIndex); + break; + case QuantifierGreedy: + backtrackCharacterClassGreedy(opIndex); + break; + case QuantifierNonGreedy: + backtrackCharacterClassNonGreedy(opIndex); + break; + } + break; + + case PatternTerm::TypeAssertionBOL: + backtrackAssertionBOL(opIndex); + break; + + case PatternTerm::TypeAssertionEOL: + backtrackAssertionEOL(opIndex); + break; + + case PatternTerm::TypeAssertionWordBoundary: + backtrackAssertionWordBoundary(opIndex); + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypeParenthesesSubpattern: + case PatternTerm::TypeParentheticalAssertion: + ASSERT_NOT_REACHED(); + + case PatternTerm::TypeDotStarEnclosure: + backtrackDotStarEnclosure(opIndex); + break; + + case PatternTerm::TypeBackReference: + m_shouldFallBack = true; + break; + } + } + + void generate() + { + // Forwards generate the matching code. + ASSERT(m_ops.size()); + size_t opIndex = 0; + + do { + YarrOp& op = m_ops[opIndex]; + switch (op.m_op) { + + case OpTerm: + generateTerm(opIndex); + break; + + // OpBodyAlternativeBegin/Next/End + // + // These nodes wrap the set of alternatives in the body of the regular expression. + // There may be either one or two chains of OpBodyAlternative nodes, one representing + // the 'once through' sequence of alternatives (if any exist), and one representing + // the repeating alternatives (again, if any exist). + // + // Upon normal entry to the Begin alternative, we will check that input is available. + // Reentry to the Begin alternative will take place after the check has taken place, + // and will assume that the input position has already been progressed as appropriate. + // + // Entry to subsequent Next/End alternatives occurs when the prior alternative has + // successfully completed a match - return a success state from JIT code. + // + // Next alternatives allow for reentry optimized to suit backtracking from its + // preceding alternative. It expects the input position to still be set to a position + // appropriate to its predecessor, and it will only perform an input check if the + // predecessor had a minimum size less than its own. + // + // In the case 'once through' expressions, the End node will also have a reentry + // point to jump to when the last alternative fails. Again, this expects the input + // position to still reflect that expected by the prior alternative. + case OpBodyAlternativeBegin: { + PatternAlternative* alternative = op.m_alternative; + + // Upon entry at the head of the set of alternatives, check if input is available + // to run the first alternative. (This progresses the input position). + op.m_jumps.append(jumpIfNoAvailableInput(alternative->m_minimumSize)); + // We will reenter after the check, and assume the input position to have been + // set as appropriate to this alternative. + op.m_reentry = label(); + + m_checked += alternative->m_minimumSize; + break; + } + case OpBodyAlternativeNext: + case OpBodyAlternativeEnd: { + PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; + PatternAlternative* alternative = op.m_alternative; + + // If we get here, the prior alternative matched - return success. + + // Adjust the stack pointer to remove the pattern's frame. + removeCallFrame(); + + // Load appropriate values into the return register and the first output + // slot, and return. In the case of pattern with a fixed size, we will + // not have yet set the value in the first + ASSERT(index != returnRegister); + if (m_pattern.m_body->m_hasFixedSize) { + move(index, returnRegister); + if (priorAlternative->m_minimumSize) + sub32(Imm32(priorAlternative->m_minimumSize), returnRegister); + if (compileMode == IncludeSubpatterns) + store32(returnRegister, output); + } else + getMatchStart(returnRegister); + if (compileMode == IncludeSubpatterns) + store32(index, Address(output, 4)); + move(index, returnRegister2); + + generateReturn(); + + // This is the divide between the tail of the prior alternative, above, and + // the head of the subsequent alternative, below. + + if (op.m_op == OpBodyAlternativeNext) { + // This is the reentry point for the Next alternative. We expect any code + // that jumps here to do so with the input position matching that of the + // PRIOR alteranative, and we will only check input availability if we + // need to progress it forwards. + op.m_reentry = label(); + if (alternative->m_minimumSize > priorAlternative->m_minimumSize) { + add32(Imm32(alternative->m_minimumSize - priorAlternative->m_minimumSize), index); + op.m_jumps.append(jumpIfNoAvailableInput()); + } else if (priorAlternative->m_minimumSize > alternative->m_minimumSize) + sub32(Imm32(priorAlternative->m_minimumSize - alternative->m_minimumSize), index); + } else if (op.m_nextOp == notFound) { + // This is the reentry point for the End of 'once through' alternatives, + // jumped to when the last alternative fails to match. + op.m_reentry = label(); + sub32(Imm32(priorAlternative->m_minimumSize), index); + } + + if (op.m_op == OpBodyAlternativeNext) + m_checked += alternative->m_minimumSize; + m_checked -= priorAlternative->m_minimumSize; + break; + } + + // OpSimpleNestedAlternativeBegin/Next/End + // OpNestedAlternativeBegin/Next/End + // + // These nodes are used to handle sets of alternatives that are nested within + // subpatterns and parenthetical assertions. The 'simple' forms are used where + // we do not need to be able to backtrack back into any alternative other than + // the last, the normal forms allow backtracking into any alternative. + // + // Each Begin/Next node is responsible for planting an input check to ensure + // sufficient input is available on entry. Next nodes additionally need to + // jump to the end - Next nodes use the End node's m_jumps list to hold this + // set of jumps. + // + // In the non-simple forms, successful alternative matches must store a + // 'return address' using a DataLabelPtr, used to store the address to jump + // to when backtracking, to get to the code for the appropriate alternative. + case OpSimpleNestedAlternativeBegin: + case OpNestedAlternativeBegin: { + PatternTerm* term = op.m_term; + PatternAlternative* alternative = op.m_alternative; + PatternDisjunction* disjunction = term->parentheses.disjunction; + + // Calculate how much input we need to check for, and if non-zero check. + op.m_checkAdjust = alternative->m_minimumSize; + if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) + op.m_checkAdjust -= disjunction->m_minimumSize; + if (op.m_checkAdjust) + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + + m_checked += op.m_checkAdjust; + break; + } + case OpSimpleNestedAlternativeNext: + case OpNestedAlternativeNext: { + PatternTerm* term = op.m_term; + PatternAlternative* alternative = op.m_alternative; + PatternDisjunction* disjunction = term->parentheses.disjunction; + + // In the non-simple case, store a 'return address' so we can backtrack correctly. + if (op.m_op == OpNestedAlternativeNext) { + unsigned parenthesesFrameLocation = term->frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term->quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); + } + + if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) { + // If the previous alternative matched without consuming characters then + // backtrack to try to match while consumming some input. + op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); + } + + // If we reach here then the last alternative has matched - jump to the + // End node, to skip over any further alternatives. + // + // FIXME: this is logically O(N^2) (though N can be expected to be very + // small). We could avoid this either by adding an extra jump to the JIT + // data structures, or by making backtracking code that jumps to Next + // alternatives are responsible for checking that input is available (if + // we didn't need to plant the input checks, then m_jumps would be free). + YarrOp* endOp = &m_ops[op.m_nextOp]; + while (endOp->m_nextOp != notFound) { + ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext); + endOp = &m_ops[endOp->m_nextOp]; + } + ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd); + endOp->m_jumps.append(jump()); + + // This is the entry point for the next alternative. + op.m_reentry = label(); + + // Calculate how much input we need to check for, and if non-zero check. + op.m_checkAdjust = alternative->m_minimumSize; + if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) + op.m_checkAdjust -= disjunction->m_minimumSize; + if (op.m_checkAdjust) + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked -= lastOp.m_checkAdjust; + m_checked += op.m_checkAdjust; + break; + } + case OpSimpleNestedAlternativeEnd: + case OpNestedAlternativeEnd: { + PatternTerm* term = op.m_term; + + // In the non-simple case, store a 'return address' so we can backtrack correctly. + if (op.m_op == OpNestedAlternativeEnd) { + unsigned parenthesesFrameLocation = term->frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term->quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); + } + + if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) { + // If the previous alternative matched without consuming characters then + // backtrack to try to match while consumming some input. + op.m_zeroLengthMatch = branch32(Equal, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); + } + + // If this set of alternatives contains more than one alternative, + // then the Next nodes will have planted jumps to the End, and added + // them to this node's m_jumps list. + op.m_jumps.link(this); + op.m_jumps.clear(); + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked -= lastOp.m_checkAdjust; + break; + } + + // OpParenthesesSubpatternOnceBegin/End + // + // These nodes support (optionally) capturing subpatterns, that have a + // quantity count of 1 (this covers fixed once, and ?/?? quantifiers). + case OpParenthesesSubpatternOnceBegin: { + PatternTerm* term = op.m_term; + unsigned parenthesesFrameLocation = term->frameLocation; + const RegisterID indexTemporary = regT0; + ASSERT(term->quantityCount == 1); + + // Upon entry to a Greedy quantified set of parenthese store the index. + // We'll use this for two purposes: + // - To indicate which iteration we are on of mathing the remainder of + // the expression after the parentheses - the first, including the + // match within the parentheses, or the second having skipped over them. + // - To check for empty matches, which must be rejected. + // + // At the head of a NonGreedy set of parentheses we'll immediately set the + // value on the stack to -1 (indicating a match skipping the subpattern), + // and plant a jump to the end. We'll also plant a label to backtrack to + // to reenter the subpattern later, with a store to set up index on the + // second iteration. + // + // FIXME: for capturing parens, could use the index in the capture array? + if (term->quantityType == QuantifierGreedy) + storeToFrame(index, parenthesesFrameLocation); + else if (term->quantityType == QuantifierNonGreedy) { + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); + op.m_jumps.append(jump()); + op.m_reentry = label(); + storeToFrame(index, parenthesesFrameLocation); + } + + // If the parenthese are capturing, store the starting index value to the + // captures array, offsetting as necessary. + // + // FIXME: could avoid offsetting this value in JIT code, apply + // offsets only afterwards, at the point the results array is + // being accessed. + if (term->capture() && compileMode == IncludeSubpatterns) { + int inputOffset = term->inputPosition - m_checked; + if (term->quantityType == QuantifierFixedCount) + inputOffset -= term->parentheses.disjunction->m_minimumSize; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + setSubpatternStart(indexTemporary, term->parentheses.subpatternId); + } else + setSubpatternStart(index, term->parentheses.subpatternId); + } + break; + } + case OpParenthesesSubpatternOnceEnd: { + PatternTerm* term = op.m_term; + const RegisterID indexTemporary = regT0; + ASSERT(term->quantityCount == 1); + +#ifndef NDEBUG + // Runtime ASSERT to make sure that the nested alternative handled the + // "no input consumed" check. + if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) { + Jump pastBreakpoint; + pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); + breakpoint(); + pastBreakpoint.link(this); + } +#endif + + // If the parenthese are capturing, store the ending index value to the + // captures array, offsetting as necessary. + // + // FIXME: could avoid offsetting this value in JIT code, apply + // offsets only afterwards, at the point the results array is + // being accessed. + if (term->capture() && compileMode == IncludeSubpatterns) { + int inputOffset = term->inputPosition - m_checked; + if (inputOffset) { + move(index, indexTemporary); + add32(Imm32(inputOffset), indexTemporary); + setSubpatternEnd(indexTemporary, term->parentheses.subpatternId); + } else + setSubpatternEnd(index, term->parentheses.subpatternId); + } + + // If the parentheses are quantified Greedy then add a label to jump back + // to if get a failed match from after the parentheses. For NonGreedy + // parentheses, link the jump from before the subpattern to here. + if (term->quantityType == QuantifierGreedy) + op.m_reentry = label(); + else if (term->quantityType == QuantifierNonGreedy) { + YarrOp& beginOp = m_ops[op.m_previousOp]; + beginOp.m_jumps.link(this); + } + break; + } + + // OpParenthesesSubpatternTerminalBegin/End + case OpParenthesesSubpatternTerminalBegin: { + PatternTerm* term = op.m_term; + ASSERT(term->quantityType == QuantifierGreedy); + ASSERT(term->quantityCount == quantifyInfinite); + ASSERT(!term->capture()); + + // Upon entry set a label to loop back to. + op.m_reentry = label(); + + // Store the start index of the current match; we need to reject zero + // length matches. + storeToFrame(index, term->frameLocation); + break; + } + case OpParenthesesSubpatternTerminalEnd: { + YarrOp& beginOp = m_ops[op.m_previousOp]; +#ifndef NDEBUG + PatternTerm* term = op.m_term; + + // Runtime ASSERT to make sure that the nested alternative handled the + // "no input consumed" check. + Jump pastBreakpoint; + pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); + breakpoint(); + pastBreakpoint.link(this); +#endif + + // We know that the match is non-zero, we can accept it and + // loop back up to the head of the subpattern. + jump(beginOp.m_reentry); + + // This is the entry point to jump to when we stop matching - we will + // do so once the subpattern cannot match any more. + op.m_reentry = label(); + break; + } + + // OpParentheticalAssertionBegin/End + case OpParentheticalAssertionBegin: { + PatternTerm* term = op.m_term; + + // Store the current index - assertions should not update index, so + // we will need to restore it upon a successful match. + unsigned parenthesesFrameLocation = term->frameLocation; + storeToFrame(index, parenthesesFrameLocation); + + // Check + op.m_checkAdjust = m_checked - term->inputPosition; + if (op.m_checkAdjust) + sub32(Imm32(op.m_checkAdjust), index); + + m_checked -= op.m_checkAdjust; + break; + } + case OpParentheticalAssertionEnd: { + PatternTerm* term = op.m_term; + + // Restore the input index value. + unsigned parenthesesFrameLocation = term->frameLocation; + loadFromFrame(parenthesesFrameLocation, index); + + // If inverted, a successful match of the assertion must be treated + // as a failure, so jump to backtracking. + if (term->invert()) { + op.m_jumps.append(jump()); + op.m_reentry = label(); + } + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked += lastOp.m_checkAdjust; + break; + } + + case OpMatchFailed: + removeCallFrame(); + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); + generateReturn(); + break; + } + + ++opIndex; + } while (opIndex < m_ops.size()); + } + + void backtrack() + { + // Backwards generate the backtracking code. + size_t opIndex = m_ops.size(); + ASSERT(opIndex); + + do { + --opIndex; + YarrOp& op = m_ops[opIndex]; + switch (op.m_op) { + + case OpTerm: + backtrackTerm(opIndex); + break; + + // OpBodyAlternativeBegin/Next/End + // + // For each Begin/Next node representing an alternative, we need to decide what to do + // in two circumstances: + // - If we backtrack back into this node, from within the alternative. + // - If the input check at the head of the alternative fails (if this exists). + // + // We treat these two cases differently since in the former case we have slightly + // more information - since we are backtracking out of a prior alternative we know + // that at least enough input was available to run it. For example, given the regular + // expression /a|b/, if we backtrack out of the first alternative (a failed pattern + // character match of 'a'), then we need not perform an additional input availability + // check before running the second alternative. + // + // Backtracking required differs for the last alternative, which in the case of the + // repeating set of alternatives must loop. The code generated for the last alternative + // will also be used to handle all input check failures from any prior alternatives - + // these require similar functionality, in seeking the next available alternative for + // which there is sufficient input. + // + // Since backtracking of all other alternatives simply requires us to link backtracks + // to the reentry point for the subsequent alternative, we will only be generating any + // code when backtracking the last alternative. + case OpBodyAlternativeBegin: + case OpBodyAlternativeNext: { + PatternAlternative* alternative = op.m_alternative; + + if (op.m_op == OpBodyAlternativeNext) { + PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; + m_checked += priorAlternative->m_minimumSize; + } + m_checked -= alternative->m_minimumSize; + + // Is this the last alternative? If not, then if we backtrack to this point we just + // need to jump to try to match the next alternative. + if (m_ops[op.m_nextOp].m_op != OpBodyAlternativeEnd) { + m_backtrackingState.linkTo(m_ops[op.m_nextOp].m_reentry, this); + break; + } + YarrOp& endOp = m_ops[op.m_nextOp]; + + YarrOp* beginOp = &op; + while (beginOp->m_op != OpBodyAlternativeBegin) { + ASSERT(beginOp->m_op == OpBodyAlternativeNext); + beginOp = &m_ops[beginOp->m_previousOp]; + } + + bool onceThrough = endOp.m_nextOp == notFound; + + // First, generate code to handle cases where we backtrack out of an attempted match + // of the last alternative. If this is a 'once through' set of alternatives then we + // have nothing to do - link this straight through to the End. + if (onceThrough) + m_backtrackingState.linkTo(endOp.m_reentry, this); + else { + // If we don't need to move the input poistion, and the pattern has a fixed size + // (in which case we omit the store of the start index until the pattern has matched) + // then we can just link the backtrack out of the last alternative straight to the + // head of the first alternative. + if (m_pattern.m_body->m_hasFixedSize + && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) + && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1)) + m_backtrackingState.linkTo(beginOp->m_reentry, this); + else { + // We need to generate a trampoline of code to execute before looping back + // around to the first alternative. + m_backtrackingState.link(this); + + // If the pattern size is not fixed, then store the start index, for use if we match. + if (!m_pattern.m_body->m_hasFixedSize) { + if (alternative->m_minimumSize == 1) + setMatchStart(index); + else { + move(index, regT0); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize - 1), regT0); + else + add32(TrustedImm32(1), regT0); + setMatchStart(regT0); + } + } + + // Generate code to loop. Check whether the last alternative is longer than the + // first (e.g. /a|xy/ or /a|xyz/). + if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { + // We want to loop, and increment input position. If the delta is 1, it is + // already correctly incremented, if more than one then decrement as appropriate. + unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; + ASSERT(delta); + if (delta != 1) + sub32(Imm32(delta - 1), index); + jump(beginOp->m_reentry); + } else { + // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot + // be sufficent input available to handle this, so just fall through. + unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; + if (delta != 0xFFFFFFFFu) { + // We need to check input because we are incrementing the input. + add32(Imm32(delta + 1), index); + checkInput().linkTo(beginOp->m_reentry, this); + } + } + } + } + + // We can reach this point in the code in two ways: + // - Fallthrough from the code above (a repeating alternative backtracked out of its + // last alternative, and did not have sufficent input to run the first). + // - We will loop back up to the following label when a releating alternative loops, + // following a failed input check. + // + // Either way, we have just failed the input check for the first alternative. + Label firstInputCheckFailed(this); + + // Generate code to handle input check failures from alternatives except the last. + // prevOp is the alternative we're handling a bail out from (initially Begin), and + // nextOp is the alternative we will be attempting to reenter into. + // + // We will link input check failures from the forwards matching path back to the code + // that can handle them. + YarrOp* prevOp = beginOp; + YarrOp* nextOp = &m_ops[beginOp->m_nextOp]; + while (nextOp->m_op != OpBodyAlternativeEnd) { + prevOp->m_jumps.link(this); + + // We only get here if an input check fails, it is only worth checking again + // if the next alternative has a minimum size less than the last. + if (prevOp->m_alternative->m_minimumSize > nextOp->m_alternative->m_minimumSize) { + // FIXME: if we added an extra label to YarrOp, we could avoid needing to + // subtract delta back out, and reduce this code. Should performance test + // the benefit of this. + unsigned delta = prevOp->m_alternative->m_minimumSize - nextOp->m_alternative->m_minimumSize; + sub32(Imm32(delta), index); + Jump fail = jumpIfNoAvailableInput(); + add32(Imm32(delta), index); + jump(nextOp->m_reentry); + fail.link(this); + } else if (prevOp->m_alternative->m_minimumSize < nextOp->m_alternative->m_minimumSize) + add32(Imm32(nextOp->m_alternative->m_minimumSize - prevOp->m_alternative->m_minimumSize), index); + prevOp = nextOp; + nextOp = &m_ops[nextOp->m_nextOp]; + } + + // We fall through to here if there is insufficient input to run the last alternative. + + // If there is insufficient input to run the last alternative, then for 'once through' + // alternatives we are done - just jump back up into the forwards matching path at the End. + if (onceThrough) { + op.m_jumps.linkTo(endOp.m_reentry, this); + jump(endOp.m_reentry); + break; + } + + // For repeating alternatives, link any input check failure from the last alternative to + // this point. + op.m_jumps.link(this); + + bool needsToUpdateMatchStart = !m_pattern.m_body->m_hasFixedSize; + + // Check for cases where input position is already incremented by 1 for the last + // alternative (this is particularly useful where the minimum size of the body + // disjunction is 0, e.g. /a*|b/). + if (needsToUpdateMatchStart && alternative->m_minimumSize == 1) { + // index is already incremented by 1, so just store it now! + setMatchStart(index); + needsToUpdateMatchStart = false; + } + + // Check whether there is sufficient input to loop. Increment the input position by + // one, and check. Also add in the minimum disjunction size before checking - there + // is no point in looping if we're just going to fail all the input checks around + // the next iteration. + ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); + if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { + // If the last alternative had the same minimum size as the disjunction, + // just simply increment input pos by 1, no adjustment based on minimum size. + add32(TrustedImm32(1), index); + } else { + // If the minumum for the last alternative was one greater than than that + // for the disjunction, we're already progressed by 1, nothing to do! + unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; + if (delta) + sub32(Imm32(delta), index); + } + Jump matchFailed = jumpIfNoAvailableInput(); + + if (needsToUpdateMatchStart) { + if (!m_pattern.m_body->m_minimumSize) + setMatchStart(index); + else { + move(index, regT0); + sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); + setMatchStart(regT0); + } + } + + // Calculate how much more input the first alternative requires than the minimum + // for the body as a whole. If no more is needed then we dont need an additional + // input check here - jump straight back up to the start of the first alternative. + if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) + jump(beginOp->m_reentry); + else { + if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) + add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); + else + sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); + checkInput().linkTo(beginOp->m_reentry, this); + jump(firstInputCheckFailed); + } + + // We jump to here if we iterate to the point that there is insufficient input to + // run any matches, and need to return a failure state from JIT code. + matchFailed.link(this); + + removeCallFrame(); + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); + generateReturn(); + break; + } + case OpBodyAlternativeEnd: { + // We should never backtrack back into a body disjunction. + ASSERT(m_backtrackingState.isEmpty()); + + PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; + m_checked += priorAlternative->m_minimumSize; + break; + } + + // OpSimpleNestedAlternativeBegin/Next/End + // OpNestedAlternativeBegin/Next/End + // + // Generate code for when we backtrack back out of an alternative into + // a Begin or Next node, or when the entry input count check fails. If + // there are more alternatives we need to jump to the next alternative, + // if not we backtrack back out of the current set of parentheses. + // + // In the case of non-simple nested assertions we need to also link the + // 'return address' appropriately to backtrack back out into the correct + // alternative. + case OpSimpleNestedAlternativeBegin: + case OpSimpleNestedAlternativeNext: + case OpNestedAlternativeBegin: + case OpNestedAlternativeNext: { + YarrOp& nextOp = m_ops[op.m_nextOp]; + bool isBegin = op.m_previousOp == notFound; + bool isLastAlternative = nextOp.m_nextOp == notFound; + ASSERT(isBegin == (op.m_op == OpSimpleNestedAlternativeBegin || op.m_op == OpNestedAlternativeBegin)); + ASSERT(isLastAlternative == (nextOp.m_op == OpSimpleNestedAlternativeEnd || nextOp.m_op == OpNestedAlternativeEnd)); + + // Treat an input check failure the same as a failed match. + m_backtrackingState.append(op.m_jumps); + + // Set the backtracks to jump to the appropriate place. We may need + // to link the backtracks in one of three different way depending on + // the type of alternative we are dealing with: + // - A single alternative, with no simplings. + // - The last alternative of a set of two or more. + // - An alternative other than the last of a set of two or more. + // + // In the case of a single alternative on its own, we don't need to + // jump anywhere - if the alternative fails to match we can just + // continue to backtrack out of the parentheses without jumping. + // + // In the case of the last alternative in a set of more than one, we + // need to jump to return back out to the beginning. We'll do so by + // adding a jump to the End node's m_jumps list, and linking this + // when we come to generate the Begin node. For alternatives other + // than the last, we need to jump to the next alternative. + // + // If the alternative had adjusted the input position we must link + // backtracking to here, correct, and then jump on. If not we can + // link the backtracks directly to their destination. + if (op.m_checkAdjust) { + // Handle the cases where we need to link the backtracks here. + m_backtrackingState.link(this); + sub32(Imm32(op.m_checkAdjust), index); + if (!isLastAlternative) { + // An alternative that is not the last should jump to its successor. + jump(nextOp.m_reentry); + } else if (!isBegin) { + // The last of more than one alternatives must jump back to the beginning. + nextOp.m_jumps.append(jump()); + } else { + // A single alternative on its own can fall through. + m_backtrackingState.fallthrough(); + } + } else { + // Handle the cases where we can link the backtracks directly to their destinations. + if (!isLastAlternative) { + // An alternative that is not the last should jump to its successor. + m_backtrackingState.linkTo(nextOp.m_reentry, this); + } else if (!isBegin) { + // The last of more than one alternatives must jump back to the beginning. + m_backtrackingState.takeBacktracksToJumpList(nextOp.m_jumps, this); + } + // In the case of a single alternative on its own do nothing - it can fall through. + } + + // If there is a backtrack jump from a zero length match link it here. + if (op.m_zeroLengthMatch.isSet()) + m_backtrackingState.append(op.m_zeroLengthMatch); + + // At this point we've handled the backtracking back into this node. + // Now link any backtracks that need to jump to here. + + // For non-simple alternatives, link the alternative's 'return address' + // so that we backtrack back out into the previous alternative. + if (op.m_op == OpNestedAlternativeNext) + m_backtrackingState.append(op.m_returnAddress); + + // If there is more than one alternative, then the last alternative will + // have planted a jump to be linked to the end. This jump was added to the + // End node's m_jumps list. If we are back at the beginning, link it here. + if (isBegin) { + YarrOp* endOp = &m_ops[op.m_nextOp]; + while (endOp->m_nextOp != notFound) { + ASSERT(endOp->m_op == OpSimpleNestedAlternativeNext || endOp->m_op == OpNestedAlternativeNext); + endOp = &m_ops[endOp->m_nextOp]; + } + ASSERT(endOp->m_op == OpSimpleNestedAlternativeEnd || endOp->m_op == OpNestedAlternativeEnd); + m_backtrackingState.append(endOp->m_jumps); + } + + if (!isBegin) { + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked += lastOp.m_checkAdjust; + } + m_checked -= op.m_checkAdjust; + break; + } + case OpSimpleNestedAlternativeEnd: + case OpNestedAlternativeEnd: { + PatternTerm* term = op.m_term; + + // If there is a backtrack jump from a zero length match link it here. + if (op.m_zeroLengthMatch.isSet()) + m_backtrackingState.append(op.m_zeroLengthMatch); + + // If we backtrack into the end of a simple subpattern do nothing; + // just continue through into the last alternative. If we backtrack + // into the end of a non-simple set of alterntives we need to jump + // to the backtracking return address set up during generation. + if (op.m_op == OpNestedAlternativeEnd) { + m_backtrackingState.link(this); + + // Plant a jump to the return address. + unsigned parenthesesFrameLocation = term->frameLocation; + unsigned alternativeFrameLocation = parenthesesFrameLocation; + if (term->quantityType != QuantifierFixedCount) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + loadFromFrameAndJump(alternativeFrameLocation); + + // Link the DataLabelPtr associated with the end of the last + // alternative to this point. + m_backtrackingState.append(op.m_returnAddress); + } + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked += lastOp.m_checkAdjust; + break; + } + + // OpParenthesesSubpatternOnceBegin/End + // + // When we are backtracking back out of a capturing subpattern we need + // to clear the start index in the matches output array, to record that + // this subpattern has not been captured. + // + // When backtracking back out of a Greedy quantified subpattern we need + // to catch this, and try running the remainder of the alternative after + // the subpattern again, skipping the parentheses. + // + // Upon backtracking back into a quantified set of parentheses we need to + // check whether we were currently skipping the subpattern. If not, we + // can backtrack into them, if we were we need to either backtrack back + // out of the start of the parentheses, or jump back to the forwards + // matching start, depending of whether the match is Greedy or NonGreedy. + case OpParenthesesSubpatternOnceBegin: { + PatternTerm* term = op.m_term; + ASSERT(term->quantityCount == 1); + + // We only need to backtrack to thispoint if capturing or greedy. + if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) { + m_backtrackingState.link(this); + + // If capturing, clear the capture (we only need to reset start). + if (term->capture() && compileMode == IncludeSubpatterns) + clearSubpatternStart(term->parentheses.subpatternId); + + // If Greedy, jump to the end. + if (term->quantityType == QuantifierGreedy) { + // Clear the flag in the stackframe indicating we ran through the subpattern. + unsigned parenthesesFrameLocation = term->frameLocation; + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); + // Jump to after the parentheses, skipping the subpattern. + jump(m_ops[op.m_nextOp].m_reentry); + // A backtrack from after the parentheses, when skipping the subpattern, + // will jump back to here. + op.m_jumps.link(this); + } + + m_backtrackingState.fallthrough(); + } + break; + } + case OpParenthesesSubpatternOnceEnd: { + PatternTerm* term = op.m_term; + + if (term->quantityType != QuantifierFixedCount) { + m_backtrackingState.link(this); + + // Check whether we should backtrack back into the parentheses, or if we + // are currently in a state where we had skipped over the subpattern + // (in which case the flag value on the stack will be -1). + unsigned parenthesesFrameLocation = term->frameLocation; + Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1)); + + if (term->quantityType == QuantifierGreedy) { + // For Greedy parentheses, we skip after having already tried going + // through the subpattern, so if we get here we're done. + YarrOp& beginOp = m_ops[op.m_previousOp]; + beginOp.m_jumps.append(hadSkipped); + } else { + // For NonGreedy parentheses, we try skipping the subpattern first, + // so if we get here we need to try running through the subpattern + // next. Jump back to the start of the parentheses in the forwards + // matching path. + ASSERT(term->quantityType == QuantifierNonGreedy); + YarrOp& beginOp = m_ops[op.m_previousOp]; + hadSkipped.linkTo(beginOp.m_reentry, this); + } + + m_backtrackingState.fallthrough(); + } + + m_backtrackingState.append(op.m_jumps); + break; + } + + // OpParenthesesSubpatternTerminalBegin/End + // + // Terminal subpatterns will always match - there is nothing after them to + // force a backtrack, and they have a minimum count of 0, and as such will + // always produce an acceptable result. + case OpParenthesesSubpatternTerminalBegin: { + // We will backtrack to this point once the subpattern cannot match any + // more. Since no match is accepted as a successful match (we are Greedy + // quantified with a minimum of zero) jump back to the forwards matching + // path at the end. + YarrOp& endOp = m_ops[op.m_nextOp]; + m_backtrackingState.linkTo(endOp.m_reentry, this); + break; + } + case OpParenthesesSubpatternTerminalEnd: + // We should never be backtracking to here (hence the 'terminal' in the name). + ASSERT(m_backtrackingState.isEmpty()); + m_backtrackingState.append(op.m_jumps); + break; + + // OpParentheticalAssertionBegin/End + case OpParentheticalAssertionBegin: { + PatternTerm* term = op.m_term; + YarrOp& endOp = m_ops[op.m_nextOp]; + + // We need to handle the backtracks upon backtracking back out + // of a parenthetical assertion if either we need to correct + // the input index, or the assertion was inverted. + if (op.m_checkAdjust || term->invert()) { + m_backtrackingState.link(this); + + if (op.m_checkAdjust) + add32(Imm32(op.m_checkAdjust), index); + + // In an inverted assertion failure to match the subpattern + // is treated as a successful match - jump to the end of the + // subpattern. We already have adjusted the input position + // back to that before the assertion, which is correct. + if (term->invert()) + jump(endOp.m_reentry); + + m_backtrackingState.fallthrough(); + } + + // The End node's jump list will contain any backtracks into + // the end of the assertion. Also, if inverted, we will have + // added the failure caused by a successful match to this. + m_backtrackingState.append(endOp.m_jumps); + + m_checked += op.m_checkAdjust; + break; + } + case OpParentheticalAssertionEnd: { + // FIXME: We should really be clearing any nested subpattern + // matches on bailing out from after the pattern. Firefox has + // this bug too (presumably because they use YARR!) + + // Never backtrack into an assertion; later failures bail to before the begin. + m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this); + + YarrOp& lastOp = m_ops[op.m_previousOp]; + m_checked -= lastOp.m_checkAdjust; + break; + } + + case OpMatchFailed: + break; + } + + } while (opIndex); + } + + // Compilation methods: + // ==================== + + // opCompileParenthesesSubpattern + // Emits ops for a subpattern (set of parentheses). These consist + // of a set of alternatives wrapped in an outer set of nodes for + // the parentheses. + // Supported types of parentheses are 'Once' (quantityCount == 1) + // and 'Terminal' (non-capturing parentheses quantified as greedy + // and infinite). + // Alternatives will use the 'Simple' set of ops if either the + // subpattern is terminal (in which case we will never need to + // backtrack), or if the subpattern only contains one alternative. + void opCompileParenthesesSubpattern(PatternTerm* term) + { + YarrOpCode parenthesesBeginOpCode; + YarrOpCode parenthesesEndOpCode; + YarrOpCode alternativeBeginOpCode = OpSimpleNestedAlternativeBegin; + YarrOpCode alternativeNextOpCode = OpSimpleNestedAlternativeNext; + YarrOpCode alternativeEndOpCode = OpSimpleNestedAlternativeEnd; + + // We can currently only compile quantity 1 subpatterns that are + // not copies. We generate a copy in the case of a range quantifier, + // e.g. /(?:x){3,9}/, or /(?:x)+/ (These are effectively expanded to + // /(?:x){3,3}(?:x){0,6}/ and /(?:x)(?:x)*/ repectively). The problem + // comes where the subpattern is capturing, in which case we would + // need to restore the capture from the first subpattern upon a + // failure in the second. + if (term->quantityCount == 1 && !term->parentheses.isCopy) { + // Select the 'Once' nodes. + parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin; + parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd; + + // If there is more than one alternative we cannot use the 'simple' nodes. + if (term->parentheses.disjunction->m_alternatives.size() != 1) { + alternativeBeginOpCode = OpNestedAlternativeBegin; + alternativeNextOpCode = OpNestedAlternativeNext; + alternativeEndOpCode = OpNestedAlternativeEnd; + } + } else if (term->parentheses.isTerminal) { + // Select the 'Terminal' nodes. + parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin; + parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd; + } else { + // This subpattern is not supported by the JIT. + m_shouldFallBack = true; + return; + } + + size_t parenBegin = m_ops.size(); + m_ops.append(parenthesesBeginOpCode); + + m_ops.append(alternativeBeginOpCode); + m_ops.last().m_previousOp = notFound; + m_ops.last().m_term = term; + Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives; + for (unsigned i = 0; i < alternatives.size(); ++i) { + size_t lastOpIndex = m_ops.size() - 1; + + PatternAlternative* nestedAlternative = alternatives[i]; + opCompileAlternative(nestedAlternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(alternativeNextOpCode)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = nestedAlternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + thisOp.m_term = term; + } + YarrOp& lastOp = m_ops.last(); + ASSERT(lastOp.m_op == alternativeNextOpCode); + lastOp.m_op = alternativeEndOpCode; + lastOp.m_alternative = 0; + lastOp.m_nextOp = notFound; + + size_t parenEnd = m_ops.size(); + m_ops.append(parenthesesEndOpCode); + + m_ops[parenBegin].m_term = term; + m_ops[parenBegin].m_previousOp = notFound; + m_ops[parenBegin].m_nextOp = parenEnd; + m_ops[parenEnd].m_term = term; + m_ops[parenEnd].m_previousOp = parenBegin; + m_ops[parenEnd].m_nextOp = notFound; + } + + // opCompileParentheticalAssertion + // Emits ops for a parenthetical assertion. These consist of an + // OpSimpleNestedAlternativeBegin/Next/End set of nodes wrapping + // the alternatives, with these wrapped by an outer pair of + // OpParentheticalAssertionBegin/End nodes. + // We can always use the OpSimpleNestedAlternative nodes in the + // case of parenthetical assertions since these only ever match + // once, and will never backtrack back into the assertion. + void opCompileParentheticalAssertion(PatternTerm* term) + { + size_t parenBegin = m_ops.size(); + m_ops.append(OpParentheticalAssertionBegin); + + m_ops.append(OpSimpleNestedAlternativeBegin); + m_ops.last().m_previousOp = notFound; + m_ops.last().m_term = term; + Vector<PatternAlternative*>& alternatives = term->parentheses.disjunction->m_alternatives; + for (unsigned i = 0; i < alternatives.size(); ++i) { + size_t lastOpIndex = m_ops.size() - 1; + + PatternAlternative* nestedAlternative = alternatives[i]; + opCompileAlternative(nestedAlternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(OpSimpleNestedAlternativeNext)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = nestedAlternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + thisOp.m_term = term; + } + YarrOp& lastOp = m_ops.last(); + ASSERT(lastOp.m_op == OpSimpleNestedAlternativeNext); + lastOp.m_op = OpSimpleNestedAlternativeEnd; + lastOp.m_alternative = 0; + lastOp.m_nextOp = notFound; + + size_t parenEnd = m_ops.size(); + m_ops.append(OpParentheticalAssertionEnd); + + m_ops[parenBegin].m_term = term; + m_ops[parenBegin].m_previousOp = notFound; + m_ops[parenBegin].m_nextOp = parenEnd; + m_ops[parenEnd].m_term = term; + m_ops[parenEnd].m_previousOp = parenBegin; + m_ops[parenEnd].m_nextOp = notFound; + } + + // opCompileAlternative + // Called to emit nodes for all terms in an alternative. + void opCompileAlternative(PatternAlternative* alternative) + { + optimizeAlternative(alternative); + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm* term = &alternative->m_terms[i]; + + switch (term->type) { + case PatternTerm::TypeParenthesesSubpattern: + opCompileParenthesesSubpattern(term); + break; + + case PatternTerm::TypeParentheticalAssertion: + opCompileParentheticalAssertion(term); + break; + + default: + m_ops.append(term); + } + } + } + + // opCompileBody + // This method compiles the body disjunction of the regular expression. + // The body consists of two sets of alternatives - zero or more 'once + // through' (BOL anchored) alternatives, followed by zero or more + // repeated alternatives. + // For each of these two sets of alteratives, if not empty they will be + // wrapped in a set of OpBodyAlternativeBegin/Next/End nodes (with the + // 'begin' node referencing the first alternative, and 'next' nodes + // referencing any further alternatives. The begin/next/end nodes are + // linked together in a doubly linked list. In the case of repeating + // alternatives, the end node is also linked back to the beginning. + // If no repeating alternatives exist, then a OpMatchFailed node exists + // to return the failing result. + void opCompileBody(PatternDisjunction* disjunction) + { + Vector<PatternAlternative*>& alternatives = disjunction->m_alternatives; + size_t currentAlternativeIndex = 0; + + // Emit the 'once through' alternatives. + if (alternatives.size() && alternatives[0]->onceThrough()) { + m_ops.append(YarrOp(OpBodyAlternativeBegin)); + m_ops.last().m_previousOp = notFound; + + do { + size_t lastOpIndex = m_ops.size() - 1; + PatternAlternative* alternative = alternatives[currentAlternativeIndex]; + opCompileAlternative(alternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(OpBodyAlternativeNext)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = alternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + + ++currentAlternativeIndex; + } while (currentAlternativeIndex < alternatives.size() && alternatives[currentAlternativeIndex]->onceThrough()); + + YarrOp& lastOp = m_ops.last(); + + ASSERT(lastOp.m_op == OpBodyAlternativeNext); + lastOp.m_op = OpBodyAlternativeEnd; + lastOp.m_alternative = 0; + lastOp.m_nextOp = notFound; + } + + if (currentAlternativeIndex == alternatives.size()) { + m_ops.append(YarrOp(OpMatchFailed)); + return; + } + + // Emit the repeated alternatives. + size_t repeatLoop = m_ops.size(); + m_ops.append(YarrOp(OpBodyAlternativeBegin)); + m_ops.last().m_previousOp = notFound; + do { + size_t lastOpIndex = m_ops.size() - 1; + PatternAlternative* alternative = alternatives[currentAlternativeIndex]; + ASSERT(!alternative->onceThrough()); + opCompileAlternative(alternative); + + size_t thisOpIndex = m_ops.size(); + m_ops.append(YarrOp(OpBodyAlternativeNext)); + + YarrOp& lastOp = m_ops[lastOpIndex]; + YarrOp& thisOp = m_ops[thisOpIndex]; + + lastOp.m_alternative = alternative; + lastOp.m_nextOp = thisOpIndex; + thisOp.m_previousOp = lastOpIndex; + + ++currentAlternativeIndex; + } while (currentAlternativeIndex < alternatives.size()); + YarrOp& lastOp = m_ops.last(); + ASSERT(lastOp.m_op == OpBodyAlternativeNext); + lastOp.m_op = OpBodyAlternativeEnd; + lastOp.m_alternative = 0; + lastOp.m_nextOp = repeatLoop; + } + + void generateEnter() + { +#if CPU(X86_64) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + push(X86Registers::ebx); +#elif CPU(X86) + push(X86Registers::ebp); + move(stackPointerRegister, X86Registers::ebp); + // TODO: do we need spill registers to fill the output pointer if there are no sub captures? + push(X86Registers::ebx); + push(X86Registers::edi); + push(X86Registers::esi); + // load output into edi (2 = saved ebp + return address). + #if COMPILER(MSVC) + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), input); + loadPtr(Address(X86Registers::ebp, 3 * sizeof(void*)), index); + loadPtr(Address(X86Registers::ebp, 4 * sizeof(void*)), length); + if (compileMode == IncludeSubpatterns) + loadPtr(Address(X86Registers::ebp, 5 * sizeof(void*)), output); + #else + if (compileMode == IncludeSubpatterns) + loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); + #endif +#elif CPU(ARM) + push(ARMRegisters::r4); + push(ARMRegisters::r5); + push(ARMRegisters::r6); +#if CPU(ARM_TRADITIONAL) + push(ARMRegisters::r8); // scratch register +#endif + if (compileMode == IncludeSubpatterns) + move(ARMRegisters::r3, output); +#elif CPU(SH4) + push(SH4Registers::r11); + push(SH4Registers::r13); +#elif CPU(MIPS) + // Do nothing. +#endif + } + + void generateReturn() + { +#if CPU(X86_64) + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(X86) + pop(X86Registers::esi); + pop(X86Registers::edi); + pop(X86Registers::ebx); + pop(X86Registers::ebp); +#elif CPU(ARM) +#if CPU(ARM_TRADITIONAL) + pop(ARMRegisters::r8); // scratch register +#endif + pop(ARMRegisters::r6); + pop(ARMRegisters::r5); + pop(ARMRegisters::r4); +#elif CPU(SH4) + pop(SH4Registers::r13); + pop(SH4Registers::r11); +#elif CPU(MIPS) + // Do nothing +#endif + ret(); + } + +public: + YarrGenerator(YarrPattern& pattern, YarrCharSize charSize) + : m_pattern(pattern) + , m_charSize(charSize) + , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo) + , m_shouldFallBack(false) + , m_checked(0) + { + } + + void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject) + { + generateEnter(); + + Jump hasInput = checkInput(); + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); + generateReturn(); + hasInput.link(this); + + if (compileMode == IncludeSubpatterns) { + for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i) + store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int))); + } + + if (!m_pattern.m_body->m_hasFixedSize) + setMatchStart(index); + + initCallFrame(); + + // Compile the pattern to the internal 'YarrOp' representation. + opCompileBody(m_pattern.m_body); + + // If we encountered anything we can't handle in the JIT code + // (e.g. backreferences) then return early. + if (m_shouldFallBack) { + jitObject.setFallBack(true); + return; + } + + generate(); + backtrack(); + + // Link & finalize the code. + LinkBuffer linkBuffer(*globalData, this, REGEXP_CODE_ID); + m_backtrackingState.linkDataLabels(linkBuffer); + + if (compileMode == MatchOnly) { + if (m_charSize == Char8) + jitObject.set8BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 8-bit regular expression"))); + else + jitObject.set16BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 16-bit regular expression"))); + } else { + if (m_charSize == Char8) + jitObject.set8BitCode(FINALIZE_CODE(linkBuffer, ("8-bit regular expression"))); + else + jitObject.set16BitCode(FINALIZE_CODE(linkBuffer, ("16-bit regular expression"))); + } + jitObject.setFallBack(m_shouldFallBack); + } + +private: + YarrPattern& m_pattern; + + YarrCharSize m_charSize; + + Scale m_charScale; + + // Used to detect regular expression constructs that are not currently + // supported in the JIT; fall back to the interpreter when this is detected. + bool m_shouldFallBack; + + // The regular expression expressed as a linear sequence of operations. + Vector<YarrOp, 128> m_ops; + + // This records the current input offset being applied due to the current + // set of alternatives we are nested within. E.g. when matching the + // character 'b' within the regular expression /abc/, we will know that + // the minimum size for the alternative is 3, checked upon entry to the + // alternative, and that 'b' is at offset 1 from the start, and as such + // when matching 'b' we need to apply an offset of -2 to the load. + // + // FIXME: This should go away. Rather than tracking this value throughout + // code generation, we should gather this information up front & store it + // on the YarrOp structure. + int m_checked; + + // This class records state whilst generating the backtracking path of code. + BacktrackingState m_backtrackingState; +}; + +void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode) +{ + if (mode == MatchOnly) + YarrGenerator<MatchOnly>(pattern, charSize).compile(globalData, jitObject); + else + YarrGenerator<IncludeSubpatterns>(pattern, charSize).compile(globalData, jitObject); +} + +}} + +#endif diff --git a/src/3rdparty/masm/yarr/YarrJIT.h b/src/3rdparty/masm/yarr/YarrJIT.h new file mode 100644 index 0000000000..bb7033fdea --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrJIT.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrJIT_h +#define YarrJIT_h + +#if ENABLE(YARR_JIT) + +#include "JSGlobalData.h" +#include "MacroAssemblerCodeRef.h" +#include "MatchResult.h" +#include "Yarr.h" +#include "YarrPattern.h" + +#if CPU(X86) && !COMPILER(MSVC) +#define YARR_CALL __attribute__ ((regparm (3))) +#else +#define YARR_CALL +#endif + +namespace JSC { + +class JSGlobalData; +class ExecutablePool; + +namespace Yarr { + +class YarrCodeBlock { +#if CPU(X86_64) + typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL; + typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL; +#else + typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; + typedef EncodedMatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL; + typedef EncodedMatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL; +#endif + +public: + YarrCodeBlock() + : m_needFallBack(false) + { + } + + ~YarrCodeBlock() + { + } + + void setFallBack(bool fallback) { m_needFallBack = fallback; } + bool isFallBack() { return m_needFallBack; } + + bool has8BitCode() { return m_ref8.size(); } + bool has16BitCode() { return m_ref16.size(); } + void set8BitCode(MacroAssemblerCodeRef ref) { m_ref8 = ref; } + void set16BitCode(MacroAssemblerCodeRef ref) { m_ref16 = ref; } + + bool has8BitCodeMatchOnly() { return m_matchOnly8.size(); } + bool has16BitCodeMatchOnly() { return m_matchOnly16.size(); } + void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; } + void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; } + + MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output) + { + ASSERT(has8BitCode()); + return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output)); + } + + MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output) + { + ASSERT(has16BitCode()); + return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output)); + } + + MatchResult execute(const LChar* input, unsigned start, unsigned length) + { + ASSERT(has8BitCodeMatchOnly()); + return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length)); + } + + MatchResult execute(const UChar* input, unsigned start, unsigned length) + { + ASSERT(has16BitCodeMatchOnly()); + return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length)); + } + +#if ENABLE(REGEXP_TRACING) + void *getAddr() { return m_ref.code().executableAddress(); } +#endif + + void clear() + { + m_ref8 = MacroAssemblerCodeRef(); + m_ref16 = MacroAssemblerCodeRef(); + m_matchOnly8 = MacroAssemblerCodeRef(); + m_matchOnly16 = MacroAssemblerCodeRef(); + m_needFallBack = false; + } + +private: + MacroAssemblerCodeRef m_ref8; + MacroAssemblerCodeRef m_ref16; + MacroAssemblerCodeRef m_matchOnly8; + MacroAssemblerCodeRef m_matchOnly16; + bool m_needFallBack; +}; + +enum YarrJITCompileMode { + MatchOnly, + IncludeSubpatterns +}; +void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns); + +} } // namespace JSC::Yarr + +#endif + +#endif // YarrJIT_h diff --git a/src/3rdparty/masm/yarr/YarrParser.h b/src/3rdparty/masm/yarr/YarrParser.h new file mode 100644 index 0000000000..4bab1a0903 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrParser.h @@ -0,0 +1,880 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrParser_h +#define YarrParser_h + +#include "Yarr.h" +#include <wtf/ASCIICType.h> +#include <wtf/text/WTFString.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +#define REGEXP_ERROR_PREFIX "Invalid regular expression: " + +enum BuiltInCharacterClassID { + DigitClassID, + SpaceClassID, + WordClassID, + NewlineClassID, +}; + +// The Parser class should not be used directly - only via the Yarr::parse() method. +template<class Delegate, typename CharType> +class Parser { +private: + template<class FriendDelegate> + friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); + + enum ErrorCode { + NoError, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + QuantifierTooLarge, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + NumberOfErrorCodes + }; + + /* + * CharacterClassParserDelegate: + * + * The class CharacterClassParserDelegate is used in the parsing of character + * classes. This class handles detection of character ranges. This class + * implements enough of the delegate interface such that it can be passed to + * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused + * to perform the parsing of escape characters in character sets. + */ + class CharacterClassParserDelegate { + public: + CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) + : m_delegate(delegate) + , m_err(err) + , m_state(Empty) + , m_character(0) + { + } + + /* + * begin(): + * + * Called at beginning of construction. + */ + void begin(bool invert) + { + m_delegate.atomCharacterClassBegin(invert); + } + + /* + * atomPatternCharacter(): + * + * This method is called either from parseCharacterClass() (for an unescaped + * character in a character class), or from parseEscape(). In the former case + * the value true will be passed for the argument 'hyphenIsRange', and in this + * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ + * is different to /[a\-z]/). + */ + void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + { + switch (m_state) { + case AfterCharacterClass: + // Following a builtin character class we need look out for a hyphen. + // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. + // If we see a hyphen following a charater class then unlike usual + // we'll report it to the delegate immediately, and put ourself into + // a poisoned state. Any following calls to add another character or + // character class will result in an error. (A hypen following a + // character-class is itself valid, but only at the end of a regex). + if (hyphenIsRange && ch == '-') { + m_delegate.atomCharacterClassAtom('-'); + m_state = AfterCharacterClassHyphen; + return; + } + // Otherwise just fall through - cached character so treat this as Empty. + + case Empty: + m_character = ch; + m_state = CachedCharacter; + return; + + case CachedCharacter: + if (hyphenIsRange && ch == '-') + m_state = CachedCharacterHyphen; + else { + m_delegate.atomCharacterClassAtom(m_character); + m_character = ch; + } + return; + + case CachedCharacterHyphen: + if (ch < m_character) { + m_err = CharacterClassOutOfOrder; + return; + } + m_delegate.atomCharacterClassRange(m_character, ch); + m_state = Empty; + return; + + // See coment in atomBuiltInCharacterClass below. + // This too is technically an error, per ECMA-262, and again we + // we chose to allow this. Note a subtlely here that while we + // diverge from the spec's definition of CharacterRange we do + // remain in compliance with the grammar. For example, consider + // the expression /[\d-a-z]/. We comply with the grammar in + // this case by not allowing a-z to be matched as a range. + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassAtom(ch); + m_state = Empty; + return; + } + } + + /* + * atomBuiltInCharacterClass(): + * + * Adds a built-in character class, called by parseEscape(). + */ + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (m_state) { + case CachedCharacter: + // Flush the currently cached character, then fall through. + m_delegate.atomCharacterClassAtom(m_character); + + case Empty: + case AfterCharacterClass: + m_state = AfterCharacterClass; + m_delegate.atomCharacterClassBuiltIn(classID, invert); + return; + + // If we hit either of these cases, we have an invalid range that + // looks something like /[x-\d]/ or /[\d-\d]/. + // According to ECMA-262 this should be a syntax error, but + // empirical testing shows this to break teh webz. Instead we + // comply with to the ECMA-262 grammar, and assume the grammar to + // have matched the range correctly, but tweak our interpretation + // of CharacterRange. Effectively we implicitly handle the hyphen + // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. + case CachedCharacterHyphen: + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + // fall through + case AfterCharacterClassHyphen: + m_delegate.atomCharacterClassBuiltIn(classID, invert); + m_state = Empty; + return; + } + } + + /* + * end(): + * + * Called at end of construction. + */ + void end() + { + if (m_state == CachedCharacter) + m_delegate.atomCharacterClassAtom(m_character); + else if (m_state == CachedCharacterHyphen) { + m_delegate.atomCharacterClassAtom(m_character); + m_delegate.atomCharacterClassAtom('-'); + } + m_delegate.atomCharacterClassEnd(); + } + + // parseEscape() should never call these delegate methods when + // invoked with inCharacterClass set. + NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } + NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } + + private: + Delegate& m_delegate; + ErrorCode& m_err; + enum CharacterClassConstructionState { + Empty, + CachedCharacter, + CachedCharacterHyphen, + AfterCharacterClass, + AfterCharacterClassHyphen, + } m_state; + UChar m_character; + }; + + Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) + : m_delegate(delegate) + , m_backReferenceLimit(backReferenceLimit) + , m_err(NoError) + , m_data(pattern.getCharacters<CharType>()) + , m_size(pattern.length()) + , m_index(0) + , m_parenthesesNestingDepth(0) + { + } + + /* + * parseEscape(): + * + * Helper for parseTokens() AND parseCharacterClass(). + * Unlike the other parser methods, this function does not report tokens + * directly to the member delegate (m_delegate), instead tokens are + * emitted to the delegate provided as an argument. In the case of atom + * escapes, parseTokens() will call parseEscape() passing m_delegate as + * an argument, and as such the escape will be reported to the delegate. + * + * However this method may also be used by parseCharacterClass(), in which + * case a CharacterClassParserDelegate will be passed as the delegate that + * tokens should be added to. A boolean flag is also provided to indicate + * whether that an escape in a CharacterClass is being parsed (some parsing + * rules change in this context). + * + * The boolean value returned by this method indicates whether the token + * parsed was an atom (outside of a characted class \b and \B will be + * interpreted as assertions). + */ + template<bool inCharacterClass, class EscapeDelegate> + bool parseEscape(EscapeDelegate& delegate) + { + ASSERT(!m_err); + ASSERT(peek() == '\\'); + consume(); + + if (atEndOfPattern()) { + m_err = EscapeUnterminated; + return false; + } + + switch (peek()) { + // Assertions + case 'b': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('\b'); + else { + delegate.assertionWordBoundary(false); + return false; + } + break; + case 'B': + consume(); + if (inCharacterClass) + delegate.atomPatternCharacter('B'); + else { + delegate.assertionWordBoundary(true); + return false; + } + break; + + // CharacterClassEscape + case 'd': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, false); + break; + case 's': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, false); + break; + case 'w': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, false); + break; + case 'D': + consume(); + delegate.atomBuiltInCharacterClass(DigitClassID, true); + break; + case 'S': + consume(); + delegate.atomBuiltInCharacterClass(SpaceClassID, true); + break; + case 'W': + consume(); + delegate.atomBuiltInCharacterClass(WordClassID, true); + break; + + // DecimalEscape + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. + // First, try to parse this as backreference. + if (!inCharacterClass) { + ParseState state = saveState(); + + unsigned backReference = consumeNumber(); + if (backReference <= m_backReferenceLimit) { + delegate.atomBackReference(backReference); + break; + } + + restoreState(state); + } + + // Not a backreference, and not octal. + if (peek() >= '8') { + delegate.atomPatternCharacter('\\'); + break; + } + + // Fall-through to handle this as an octal escape. + } + + // Octal escape + case '0': + delegate.atomPatternCharacter(consumeOctal()); + break; + + // ControlEscape + case 'f': + consume(); + delegate.atomPatternCharacter('\f'); + break; + case 'n': + consume(); + delegate.atomPatternCharacter('\n'); + break; + case 'r': + consume(); + delegate.atomPatternCharacter('\r'); + break; + case 't': + consume(); + delegate.atomPatternCharacter('\t'); + break; + case 'v': + consume(); + delegate.atomPatternCharacter('\v'); + break; + + // ControlLetter + case 'c': { + ParseState state = saveState(); + consume(); + if (!atEndOfPattern()) { + int control = consume(); + + // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. + if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { + delegate.atomPatternCharacter(control & 0x1f); + break; + } + } + restoreState(state); + delegate.atomPatternCharacter('\\'); + break; + } + + // HexEscape + case 'x': { + consume(); + int x = tryConsumeHex(2); + if (x == -1) + delegate.atomPatternCharacter('x'); + else + delegate.atomPatternCharacter(x); + break; + } + + // UnicodeEscape + case 'u': { + consume(); + int u = tryConsumeHex(4); + if (u == -1) + delegate.atomPatternCharacter('u'); + else + delegate.atomPatternCharacter(u); + break; + } + + // IdentityEscape + default: + delegate.atomPatternCharacter(consume()); + } + + return true; + } + + /* + * parseAtomEscape(), parseCharacterClassEscape(): + * + * These methods alias to parseEscape(). + */ + bool parseAtomEscape() + { + return parseEscape<false>(m_delegate); + } + void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) + { + parseEscape<true>(delegate); + } + + /* + * parseCharacterClass(): + * + * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) + * to an instance of CharacterClassParserDelegate, to describe the character class to the + * delegate. + */ + void parseCharacterClass() + { + ASSERT(!m_err); + ASSERT(peek() == '['); + consume(); + + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + + characterClassConstructor.begin(tryConsume('^')); + + while (!atEndOfPattern()) { + switch (peek()) { + case ']': + consume(); + characterClassConstructor.end(); + return; + + case '\\': + parseCharacterClassEscape(characterClassConstructor); + break; + + default: + characterClassConstructor.atomPatternCharacter(consume(), true); + } + + if (m_err) + return; + } + + m_err = CharacterClassUnmatched; + } + + /* + * parseParenthesesBegin(): + * + * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. + */ + void parseParenthesesBegin() + { + ASSERT(!m_err); + ASSERT(peek() == '('); + consume(); + + if (tryConsume('?')) { + if (atEndOfPattern()) { + m_err = ParenthesesTypeInvalid; + return; + } + + switch (consume()) { + case ':': + m_delegate.atomParenthesesSubpatternBegin(false); + break; + + case '=': + m_delegate.atomParentheticalAssertionBegin(); + break; + + case '!': + m_delegate.atomParentheticalAssertionBegin(true); + break; + + default: + m_err = ParenthesesTypeInvalid; + } + } else + m_delegate.atomParenthesesSubpatternBegin(); + + ++m_parenthesesNestingDepth; + } + + /* + * parseParenthesesEnd(): + * + * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). + */ + void parseParenthesesEnd() + { + ASSERT(!m_err); + ASSERT(peek() == ')'); + consume(); + + if (m_parenthesesNestingDepth > 0) + m_delegate.atomParenthesesEnd(); + else + m_err = ParenthesesUnmatched; + + --m_parenthesesNestingDepth; + } + + /* + * parseQuantifier(): + * + * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. + */ + void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) + { + ASSERT(!m_err); + ASSERT(min <= max); + + if (min == UINT_MAX) { + m_err = QuantifierTooLarge; + return; + } + + if (lastTokenWasAnAtom) + m_delegate.quantifyAtom(min, max, !tryConsume('?')); + else + m_err = QuantifierWithoutAtom; + } + + /* + * parseTokens(): + * + * This method loops over the input pattern reporting tokens to the delegate. + * The method returns when a parse error is detected, or the end of the pattern + * is reached. One piece of state is tracked around the loop, which is whether + * the last token passed to the delegate was an atom (this is necessary to detect + * a parse error when a quantifier provided without an atom to quantify). + */ + void parseTokens() + { + bool lastTokenWasAnAtom = false; + + while (!atEndOfPattern()) { + switch (peek()) { + case '|': + consume(); + m_delegate.disjunction(); + lastTokenWasAnAtom = false; + break; + + case '(': + parseParenthesesBegin(); + lastTokenWasAnAtom = false; + break; + + case ')': + parseParenthesesEnd(); + lastTokenWasAnAtom = true; + break; + + case '^': + consume(); + m_delegate.assertionBOL(); + lastTokenWasAnAtom = false; + break; + + case '$': + consume(); + m_delegate.assertionEOL(); + lastTokenWasAnAtom = false; + break; + + case '.': + consume(); + m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + lastTokenWasAnAtom = true; + break; + + case '[': + parseCharacterClass(); + lastTokenWasAnAtom = true; + break; + + case '\\': + lastTokenWasAnAtom = parseAtomEscape(); + break; + + case '*': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '+': + consume(); + parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); + lastTokenWasAnAtom = false; + break; + + case '?': + consume(); + parseQuantifier(lastTokenWasAnAtom, 0, 1); + lastTokenWasAnAtom = false; + break; + + case '{': { + ParseState state = saveState(); + + consume(); + if (peekIsDigit()) { + unsigned min = consumeNumber(); + unsigned max = min; + + if (tryConsume(',')) + max = peekIsDigit() ? consumeNumber() : quantifyInfinite; + + if (tryConsume('}')) { + if (min <= max) + parseQuantifier(lastTokenWasAnAtom, min, max); + else + m_err = QuantifierOutOfOrder; + lastTokenWasAnAtom = false; + break; + } + } + + restoreState(state); + } // if we did not find a complete quantifer, fall through to the default case. + + default: + m_delegate.atomPatternCharacter(consume()); + lastTokenWasAnAtom = true; + } + + if (m_err) + return; + } + + if (m_parenthesesNestingDepth > 0) + m_err = MissingParentheses; + } + + /* + * parse(): + * + * This method calls parseTokens() to parse over the input and converts any + * error code to a const char* for a result. + */ + const char* parse() + { + if (m_size > MAX_PATTERN_SIZE) + m_err = PatternTooLarge; + else + parseTokens(); + ASSERT(atEndOfPattern() || m_err); + + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[NumberOfErrorCodes] = { + 0, // NoError + REGEXP_ERROR_PREFIX "regular expression too large", + REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", + REGEXP_ERROR_PREFIX "nothing to repeat", + REGEXP_ERROR_PREFIX "number too large in {} quantifier", + REGEXP_ERROR_PREFIX "missing )", + REGEXP_ERROR_PREFIX "unmatched parentheses", + REGEXP_ERROR_PREFIX "unrecognized character after (?", + REGEXP_ERROR_PREFIX "missing terminating ] for character class", + REGEXP_ERROR_PREFIX "range out of order in character class", + REGEXP_ERROR_PREFIX "\\ at end of pattern" + }; + + return errorMessages[m_err]; + } + + // Misc helper functions: + + typedef unsigned ParseState; + + ParseState saveState() + { + return m_index; + } + + void restoreState(ParseState state) + { + m_index = state; + } + + bool atEndOfPattern() + { + ASSERT(m_index <= m_size); + return m_index == m_size; + } + + int peek() + { + ASSERT(m_index < m_size); + return m_data[m_index]; + } + + bool peekIsDigit() + { + return !atEndOfPattern() && WTF::isASCIIDigit(peek()); + } + + unsigned peekDigit() + { + ASSERT(peekIsDigit()); + return peek() - '0'; + } + + int consume() + { + ASSERT(m_index < m_size); + return m_data[m_index++]; + } + + unsigned consumeDigit() + { + ASSERT(peekIsDigit()); + return consume() - '0'; + } + + unsigned consumeNumber() + { + unsigned n = consumeDigit(); + // check for overflow. + for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { + n = newValue; + consume(); + } + return n; + } + + unsigned consumeOctal() + { + ASSERT(WTF::isASCIIOctalDigit(peek())); + + unsigned n = consumeDigit(); + while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) + n = n * 8 + consumeDigit(); + return n; + } + + bool tryConsume(UChar ch) + { + if (atEndOfPattern() || (m_data[m_index] != ch)) + return false; + ++m_index; + return true; + } + + int tryConsumeHex(int count) + { + ParseState state = saveState(); + + int n = 0; + while (count--) { + if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { + restoreState(state); + return -1; + } + n = (n << 4) | WTF::toASCIIHexValue(consume()); + } + return n; + } + + Delegate& m_delegate; + unsigned m_backReferenceLimit; + ErrorCode m_err; + const CharType* m_data; + unsigned m_size; + unsigned m_index; + unsigned m_parenthesesNestingDepth; + + // Derived by empirical testing of compile time in PCRE and WREC. + static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; +}; + +/* + * Yarr::parse(): + * + * The parse method is passed a pattern to be parsed and a delegate upon which + * callbacks will be made to record the parsed tokens forming the regex. + * Yarr::parse() returns null on success, or a const C string providing an error + * message where a parse error occurs. + * + * The Delegate must implement the following interface: + * + * void assertionBOL(); + * void assertionEOL(); + * void assertionWordBoundary(bool invert); + * + * void atomPatternCharacter(UChar ch); + * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); + * void atomCharacterClassBegin(bool invert) + * void atomCharacterClassAtom(UChar ch) + * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + * void atomCharacterClassEnd() + * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParentheticalAssertionBegin(bool invert = false); + * void atomParenthesesEnd(); + * void atomBackReference(unsigned subpatternId); + * + * void quantifyAtom(unsigned min, unsigned max, bool greedy); + * + * void disjunction(); + * + * The regular expression is described by a sequence of assertion*() and atom*() + * callbacks to the delegate, describing the terms in the regular expression. + * Following an atom a quantifyAtom() call may occur to indicate that the previous + * atom should be quantified. In the case of atoms described across multiple + * calls (parentheses and character classes) the call to quantifyAtom() will come + * after the call to the atom*End() method, never after atom*Begin(). + * + * Character classes may either be described by a single call to + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. + * In the latter case, ...Begin() will be called, followed by a sequence of + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). + * + * Sequences of atoms and assertions are broken into alternatives via calls to + * disjunction(). Assertions, atoms, and disjunctions emitted between calls to + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. + * atomParenthesesBegin() is passed a subpatternId. In the case of a regular + * capturing subpattern, this will be the subpatternId associated with these + * parentheses, and will also by definition be the lowest subpatternId of these + * parentheses and of any nested paretheses. The atomParenthesesEnd() method + * is passed the subpatternId of the last capturing subexpression nested within + * these paretheses. In the case of a capturing subpattern with no nested + * capturing subpatterns, the same subpatternId will be passed to the begin and + * end functions. In the case of non-capturing subpatterns the subpatternId + * passed to the begin method is also the first possible subpatternId that might + * be nested within these paretheses. If a set of non-capturing parentheses does + * not contain any capturing subpatterns, then the subpatternId passed to begin + * will be greater than the subpatternId passed to end. + */ + +template<class Delegate> +const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) +{ + if (pattern.is8Bit()) + return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); + return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); +} + +} } // namespace JSC::Yarr + +#endif // YarrParser_h diff --git a/src/3rdparty/masm/yarr/YarrPattern.cpp b/src/3rdparty/masm/yarr/YarrPattern.cpp new file mode 100644 index 0000000000..c953a38d2f --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrPattern.cpp @@ -0,0 +1,874 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrPattern.h" + +#include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" +#include "YarrParser.h" +#include <wtf/Vector.h> + +using namespace WTF; + +namespace JSC { namespace Yarr { + +#include "RegExpJitTables.h" + +class CharacterClassConstructor { +public: + CharacterClassConstructor(bool isCaseInsensitive = false) + : m_isCaseInsensitive(isCaseInsensitive) + { + } + + void reset() + { + m_matches.clear(); + m_ranges.clear(); + m_matchesUnicode.clear(); + m_rangesUnicode.clear(); + } + + void append(const CharacterClass* other) + { + for (size_t i = 0; i < other->m_matches.size(); ++i) + addSorted(m_matches, other->m_matches[i]); + for (size_t i = 0; i < other->m_ranges.size(); ++i) + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); + } + + void putChar(UChar ch) + { + // Handle ascii cases. + if (ch <= 0x7f) { + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + addSorted(m_matches, toASCIIUpper(ch)); + addSorted(m_matches, toASCIILower(ch)); + } else + addSorted(m_matches, ch); + return; + } + + // Simple case, not a case-insensitive match. + if (!m_isCaseInsensitive) { + addSorted(m_matchesUnicode, ch); + return; + } + + // Add multiple matches, if necessary. + UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) + addSorted(m_matchesUnicode, ch); + else + putUnicodeIgnoreCase(ch, info); + } + + void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) + { + ASSERT(m_isCaseInsensitive); + ASSERT(ch > 0x7f); + ASSERT(ch >= info->begin && ch <= info->end); + ASSERT(info->type != CanonicalizeUnique); + if (info->type == CanonicalizeSet) { + for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + } else { + addSorted(m_matchesUnicode, ch); + addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + } + } + + void putRange(UChar lo, UChar hi) + { + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi <= 0x7f) + return; + + lo = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, lo, hi); + + if (!m_isCaseInsensitive) + return; + + UCS2CanonicalizationRange* info = rangeInfoFor(lo); + while (true) { + // Handle the range [lo .. end] + UChar end = std::min<UChar>(info->end, hi); + + switch (info->type) { + case CanonicalizeUnique: + // Nothing to do - no canonical equivalents. + break; + case CanonicalizeSet: { + UChar ch; + for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + break; + } + case CanonicalizeRangeLo: + addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); + break; + case CanonicalizeRangeHi: + addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); + break; + case CanonicalizeAlternatingAligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (lo & 1) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (!(end & 1)) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + case CanonicalizeAlternatingUnaligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (!(lo & 1)) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (end & 1) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + } + + if (hi == end) + return; + + ++info; + lo = info->begin; + }; + + } + + CharacterClass* charClass() + { + CharacterClass* characterClass = new CharacterClass(0); + + characterClass->m_matches.swap(m_matches); + characterClass->m_ranges.swap(m_ranges); + characterClass->m_matchesUnicode.swap(m_matchesUnicode); + characterClass->m_rangesUnicode.swap(m_rangesUnicode); + + return characterClass; + } + +private: + void addSorted(Vector<UChar>& matches, UChar ch) + { + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); + } + + void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + { + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + ranges.insert(i, CharacterRange(lo, hi)); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + ranges.append(CharacterRange(lo, hi)); + } + + bool m_isCaseInsensitive; + + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; +}; + +class YarrPatternConstructor { +public: + YarrPatternConstructor(YarrPattern& pattern) + : m_pattern(pattern) + , m_characterClassConstructor(pattern.m_ignoreCase) + , m_invertParentheticalAssertion(false) + { + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + + ~YarrPatternConstructor() + { + } + + void reset() + { + m_pattern.reset(); + m_characterClassConstructor.reset(); + + m_pattern.m_body = new PatternDisjunction(); + m_alternative = m_pattern.m_body->addNewAlternative(); + m_pattern.m_disjunctions.append(m_pattern.m_body); + } + + void assertionBOL() + { + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + m_alternative->m_startsWithBOL = true; + m_alternative->m_containsBOL = true; + m_pattern.m_containsBOL = true; + } + m_alternative->m_terms.append(PatternTerm::BOL()); + } + void assertionEOL() + { + m_alternative->m_terms.append(PatternTerm::EOL()); + } + void assertionWordBoundary(bool invert) + { + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); + } + + void atomPatternCharacter(UChar ch) + { + // We handle case-insensitive checking of unicode characters which do have both + // cases by handling them as if they were defined using a CharacterClass. + if (!m_pattern.m_ignoreCase || isASCII(ch)) { + m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) { + m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); + } + + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) + { + switch (classID) { + case DigitClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); + break; + case SpaceClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); + break; + case WordClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case NewlineClassID: + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + break; + } + } + + void atomCharacterClassBegin(bool invert = false) + { + m_invertCharacterClass = invert; + } + + void atomCharacterClassAtom(UChar ch) + { + m_characterClassConstructor.putChar(ch); + } + + void atomCharacterClassRange(UChar begin, UChar end) + { + m_characterClassConstructor.putRange(begin, end); + } + + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) + { + ASSERT(classID != NewlineClassID); + + switch (classID) { + case DigitClassID: + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); + break; + + case SpaceClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); + break; + + case WordClassID: + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + break; + + default: + ASSERT_NOT_REACHED(); + } + } + + void atomCharacterClassEnd() + { + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); + m_pattern.m_userCharacterClasses.append(newCharacterClass); + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + } + + void atomParenthesesSubpatternBegin(bool capture = true) + { + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; + if (capture) + m_pattern.m_numSubpatterns++; + + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + } + + void atomParentheticalAssertionBegin(bool invert = false) + { + PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); + m_pattern.m_disjunctions.append(parenthesesDisjunction); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); + m_alternative = parenthesesDisjunction->addNewAlternative(); + m_invertParentheticalAssertion = invert; + } + + void atomParenthesesEnd() + { + ASSERT(m_alternative->m_parent); + ASSERT(m_alternative->m_parent->m_parent); + + PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; + m_alternative = m_alternative->m_parent->m_parent; + + PatternTerm& lastTerm = m_alternative->lastTerm(); + + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); + unsigned numBOLAnchoredAlts = 0; + + for (unsigned i = 0; i < numParenAlternatives; i++) { + // Bubble up BOL flags + if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) + numBOLAnchoredAlts++; + } + + if (numBOLAnchoredAlts) { + m_alternative->m_containsBOL = true; + // If all the alternatives in parens start with BOL, then so does this one + if (numBOLAnchoredAlts == numParenAlternatives) + m_alternative->m_startsWithBOL = true; + } + + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; + m_invertParentheticalAssertion = false; + } + + void atomBackReference(unsigned subpatternId) + { + ASSERT(subpatternId); + m_pattern.m_containsBackreferences = true; + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); + + if (subpatternId > m_pattern.m_numSubpatterns) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + + PatternAlternative* currentAlternative = m_alternative; + ASSERT(currentAlternative); + + // Note to self: if we waited until the AST was baked, we could also remove forwards refs + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { + PatternTerm& term = currentAlternative->lastTerm(); + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); + + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + return; + } + } + + m_alternative->m_terms.append(PatternTerm(subpatternId)); + } + + // deep copy the argument disjunction. If filterStartsWithBOL is true, + // skip alternatives with m_startsWithBOL set true. + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) + { + PatternDisjunction* newDisjunction = 0; + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { + if (!newDisjunction) { + newDisjunction = new PatternDisjunction(); + newDisjunction->m_parent = disjunction->m_parent; + } + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); + } + } + + if (newDisjunction) + m_pattern.m_disjunctions.append(newDisjunction); + return newDisjunction; + } + + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) + { + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) + return PatternTerm(term); + + PatternTerm termCopy = term; + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + return termCopy; + } + + void quantifyAtom(unsigned min, unsigned max, bool greedy) + { + ASSERT(min <= max); + ASSERT(m_alternative->m_terms.size()); + + if (!max) { + m_alternative->removeLastTerm(); + return; + } + + PatternTerm& term = m_alternative->lastTerm(); + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + + if (term.type == PatternTerm::TypeParentheticalAssertion) { + // If an assertion is quantified with a minimum count of zero, it can simply be removed. + // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never + // results in any input being consumed, however the continuation passed to the assertion + // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will + // reject all zero length matches (see step 2.1). A match from the continuation of the + // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all + // this is that matches from the assertion are not required, and won't be accepted anyway, + // so no need to ever run it. + if (!min) + m_alternative->removeLastTerm(); + // We never need to run an assertion more than once. Subsequent interations will be run + // with the same start index (since assertions are non-capturing) and the same captures + // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the + // same result and captures. If the first match succeeds then the subsequent (min - 1) + // matches will too. Any additional optional matches will fail (on the same basis as the + // minimum zero quantified assertions, above), but this will still result in a match. + return; + } + + if (min == 0) + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); + else if (min == max) + term.quantify(min, QuantifierFixedCount); + else { + term.quantify(min, QuantifierFixedCount); + m_alternative->m_terms.append(copyTerm(term)); + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... + m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) + m_alternative->lastTerm().parentheses.isCopy = true; + } + } + + void disjunction() + { + m_alternative = m_alternative->m_parent->addNewAlternative(); + } + + unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + { + alternative->m_hasFixedSize = true; + Checked<unsigned> currentInputPosition = initialInputPosition; + + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { + PatternTerm& term = alternative->m_terms[i]; + + switch (term.type) { + case PatternTerm::TypeAssertionBOL: + case PatternTerm::TypeAssertionEOL: + case PatternTerm::TypeAssertionWordBoundary: + term.inputPosition = currentInputPosition.unsafeGet(); + break; + + case PatternTerm::TypeBackReference: + term.inputPosition = currentInputPosition.unsafeGet(); + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeForwardReference: + break; + + case PatternTerm::TypePatternCharacter: + term.inputPosition = currentInputPosition.unsafeGet(); + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeCharacterClass: + term.inputPosition = currentInputPosition.unsafeGet(); + if (term.quantityType != QuantifierFixedCount) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; + alternative->m_hasFixedSize = false; + } else + currentInputPosition += term.quantityCount; + break; + + case PatternTerm::TypeParenthesesSubpattern: + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. + term.frameLocation = currentCallFrameSize; + if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityType != QuantifierFixedCount) + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + // If quantity is fixed, then pre-check its minimum size. + if (term.quantityType == QuantifierFixedCount) + currentInputPosition += term.parentheses.disjunction->m_minimumSize; + term.inputPosition = currentInputPosition.unsafeGet(); + } else if (term.parentheses.isTerminal) { + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + term.inputPosition = currentInputPosition.unsafeGet(); + } else { + term.inputPosition = currentInputPosition.unsafeGet(); + setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; + } + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. + alternative->m_hasFixedSize = false; + break; + + case PatternTerm::TypeParentheticalAssertion: + term.inputPosition = currentInputPosition.unsafeGet(); + term.frameLocation = currentCallFrameSize; + currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); + break; + + case PatternTerm::TypeDotStarEnclosure: + alternative->m_hasFixedSize = false; + term.inputPosition = initialInputPosition; + break; + } + } + + alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); + return currentCallFrameSize; + } + + unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + { + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) + initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; + + unsigned minimumInputSize = UINT_MAX; + unsigned maximumCallFrameSize = 0; + bool hasFixedSize = true; + + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { + PatternAlternative* alternative = disjunction->m_alternatives[alt]; + unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); + hasFixedSize &= alternative->m_hasFixedSize; + } + + ASSERT(minimumInputSize != UINT_MAX); + ASSERT(maximumCallFrameSize >= initialCallFrameSize); + + disjunction->m_hasFixedSize = hasFixedSize; + disjunction->m_minimumSize = minimumInputSize; + disjunction->m_callFrameSize = maximumCallFrameSize; + return maximumCallFrameSize; + } + + void setupOffsets() + { + setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + } + + // This optimization identifies sets of parentheses that we will never need to backtrack. + // In these cases we do not need to store state from prior iterations. + // We can presently avoid backtracking for: + // * where the parens are at the end of the regular expression (last term in any of the + // alternatives of the main body disjunction). + // * where the parens are non-capturing, and quantified unbounded greedy (*). + // * where the parens do not contain any capturing subpatterns. + void checkForTerminalParentheses() + { + // This check is much too crude; should be just checking whether the candidate + // node contains nested capturing subpatterns, not the whole expression! + if (m_pattern.m_numSubpatterns) + return; + + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + for (size_t i = 0; i < alternatives.size(); ++i) { + Vector<PatternTerm>& terms = alternatives[i]->m_terms; + if (terms.size()) { + PatternTerm& term = terms.last(); + if (term.type == PatternTerm::TypeParenthesesSubpattern + && term.quantityType == QuantifierGreedy + && term.quantityCount == quantifyInfinite + && !term.capture()) + term.parentheses.isTerminal = true; + } + } + } + + void optimizeBOL() + { + // Look for expressions containing beginning of line (^) anchoring and unroll them. + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops + // This code relies on the parsing code tagging alternatives with m_containsBOL and + // m_startsWithBOL and rolling those up to containing alternatives. + // At this point, this is only valid for non-multiline expressions. + PatternDisjunction* disjunction = m_pattern.m_body; + + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + return; + + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); + + // Set alternatives in disjunction to "onceThrough" + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives[alt]->setOnceThrough(); + + if (loopDisjunction) { + // Move alternatives from loopDisjunction to disjunction + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + + loopDisjunction->m_alternatives.clear(); + } + } + + bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) + { + Vector<PatternTerm>& terms = alternative->m_terms; + + for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { + PatternTerm& term = terms[termIndex]; + + if (term.m_capture) + return true; + + if (term.type == PatternTerm::TypeParenthesesSubpattern) { + PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; + for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { + if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) + return true; + } + } + } + + return false; + } + + // This optimization identifies alternatives in the form of + // [^].*[?]<expression>.*[$] for expressions that don't have any + // capturing terms. The alternative is changed to <expression> + // followed by processing of the dot stars to find and adjust the + // beginning and the end of the match. + void optimizeDotStarWrappedExpressions() + { + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; + if (alternatives.size() != 1) + return; + + PatternAlternative* alternative = alternatives[0]; + Vector<PatternTerm>& terms = alternative->m_terms; + if (terms.size() >= 3) { + bool startsWithBOL = false; + bool endsWithEOL = false; + size_t termIndex, firstExpressionTerm, lastExpressionTerm; + + termIndex = 0; + if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { + startsWithBOL = true; + ++termIndex; + } + + PatternTerm& firstNonAnchorTerm = terms[termIndex]; + if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) + return; + + firstExpressionTerm = termIndex + 1; + + termIndex = terms.size() - 1; + if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { + endsWithEOL = true; + --termIndex; + } + + PatternTerm& lastNonAnchorTerm = terms[termIndex]; + if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) + return; + + lastExpressionTerm = termIndex - 1; + + if (firstExpressionTerm > lastExpressionTerm) + return; + + if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { + for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) + terms.remove(termIndex); + + for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) + terms.remove(termIndex - 1); + + terms.append(PatternTerm(startsWithBOL, endsWithEOL)); + + m_pattern.m_containsBOL = false; + } + } + } + +private: + YarrPattern& m_pattern; + PatternAlternative* m_alternative; + CharacterClassConstructor m_characterClassConstructor; + bool m_invertCharacterClass; + bool m_invertParentheticalAssertion; +}; + +const char* YarrPattern::compile(const String& patternString) +{ + YarrPatternConstructor constructor(*this); + + if (const char* error = parse(constructor, patternString)) + return error; + + // If the pattern contains illegal backreferences reset & reparse. + // Quoting Netscape's "What's new in JavaScript 1.2", + // "Note: if the number of left parentheses is less than the number specified + // in \#, the \# is taken as an octal escape as described in the next row." + if (containsIllegalBackReference()) { + unsigned numSubpatterns = m_numSubpatterns; + + constructor.reset(); +#if !ASSERT_DISABLED + const char* error = +#endif + parse(constructor, patternString, numSubpatterns); + + ASSERT(!error); + ASSERT(numSubpatterns == m_numSubpatterns); + } + + constructor.checkForTerminalParentheses(); + constructor.optimizeDotStarWrappedExpressions(); + constructor.optimizeBOL(); + + constructor.setupOffsets(); + + return 0; +} + +YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) + : m_ignoreCase(ignoreCase) + , m_multiline(multiline) + , m_containsBackreferences(false) + , m_containsBOL(false) + , m_numSubpatterns(0) + , m_maxBackReference(0) + , newlineCached(0) + , digitsCached(0) + , spacesCached(0) + , wordcharCached(0) + , nondigitsCached(0) + , nonspacesCached(0) + , nonwordcharCached(0) +{ + *error = compile(pattern); +} + +} } diff --git a/src/3rdparty/masm/yarr/YarrPattern.h b/src/3rdparty/masm/yarr/YarrPattern.h new file mode 100644 index 0000000000..14e89b8e09 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrPattern.h @@ -0,0 +1,421 @@ +/* + * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrPattern_h +#define YarrPattern_h + +#include <wtf/CheckedArithmetic.h> +#include <wtf/RefCounted.h> +#include <wtf/Vector.h> +#include <wtf/text/WTFString.h> +#include <wtf/unicode/Unicode.h> + +namespace JSC { namespace Yarr { + +struct PatternDisjunction; + +struct CharacterRange { + UChar begin; + UChar end; + + CharacterRange(UChar begin, UChar end) + : begin(begin) + , end(end) + { + } +}; + +struct CharacterClassTable : RefCounted<CharacterClassTable> { + const char* m_table; + bool m_inverted; + static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) + { + return adoptRef(new CharacterClassTable(table, inverted)); + } + +private: + CharacterClassTable(const char* table, bool inverted) + : m_table(table) + , m_inverted(inverted) + { + } +}; + +struct CharacterClass { + WTF_MAKE_FAST_ALLOCATED; +public: + // All CharacterClass instances have to have the full set of matches and ranges, + // they may have an optional table for faster lookups (which must match the + // specified matches and ranges) + CharacterClass(PassRefPtr<CharacterClassTable> table) + : m_table(table) + { + } + Vector<UChar> m_matches; + Vector<CharacterRange> m_ranges; + Vector<UChar> m_matchesUnicode; + Vector<CharacterRange> m_rangesUnicode; + RefPtr<CharacterClassTable> m_table; +}; + +enum QuantifierType { + QuantifierFixedCount, + QuantifierGreedy, + QuantifierNonGreedy, +}; + +struct PatternTerm { + enum Type { + TypeAssertionBOL, + TypeAssertionEOL, + TypeAssertionWordBoundary, + TypePatternCharacter, + TypeCharacterClass, + TypeBackReference, + TypeForwardReference, + TypeParenthesesSubpattern, + TypeParentheticalAssertion, + TypeDotStarEnclosure, + } type; + bool m_capture :1; + bool m_invert :1; + union { + UChar patternCharacter; + CharacterClass* characterClass; + unsigned backReferenceSubpatternId; + struct { + PatternDisjunction* disjunction; + unsigned subpatternId; + unsigned lastSubpatternId; + bool isCopy; + bool isTerminal; + } parentheses; + struct { + bool bolAnchor : 1; + bool eolAnchor : 1; + } anchors; + }; + QuantifierType quantityType; + Checked<unsigned> quantityCount; + int inputPosition; + unsigned frameLocation; + + PatternTerm(UChar ch) + : type(PatternTerm::TypePatternCharacter) + , m_capture(false) + , m_invert(false) + { + patternCharacter = ch; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(CharacterClass* charClass, bool invert) + : type(PatternTerm::TypeCharacterClass) + , m_capture(false) + , m_invert(invert) + { + characterClass = charClass; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) + : type(type) + , m_capture(capture) + , m_invert(invert) + { + parentheses.disjunction = disjunction; + parentheses.subpatternId = subpatternId; + parentheses.isCopy = false; + parentheses.isTerminal = false; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(Type type, bool invert = false) + : type(type) + , m_capture(false) + , m_invert(invert) + { + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(unsigned spatternId) + : type(TypeBackReference) + , m_capture(false) + , m_invert(false) + { + backReferenceSubpatternId = spatternId; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + PatternTerm(bool bolAnchor, bool eolAnchor) + : type(TypeDotStarEnclosure) + , m_capture(false) + , m_invert(false) + { + anchors.bolAnchor = bolAnchor; + anchors.eolAnchor = eolAnchor; + quantityType = QuantifierFixedCount; + quantityCount = 1; + } + + static PatternTerm ForwardReference() + { + return PatternTerm(TypeForwardReference); + } + + static PatternTerm BOL() + { + return PatternTerm(TypeAssertionBOL); + } + + static PatternTerm EOL() + { + return PatternTerm(TypeAssertionEOL); + } + + static PatternTerm WordBoundary(bool invert) + { + return PatternTerm(TypeAssertionWordBoundary, invert); + } + + bool invert() + { + return m_invert; + } + + bool capture() + { + return m_capture; + } + + void quantify(unsigned count, QuantifierType type) + { + quantityCount = count; + quantityType = type; + } +}; + +struct PatternAlternative { + WTF_MAKE_FAST_ALLOCATED; +public: + PatternAlternative(PatternDisjunction* disjunction) + : m_parent(disjunction) + , m_onceThrough(false) + , m_hasFixedSize(false) + , m_startsWithBOL(false) + , m_containsBOL(false) + { + } + + PatternTerm& lastTerm() + { + ASSERT(m_terms.size()); + return m_terms[m_terms.size() - 1]; + } + + void removeLastTerm() + { + ASSERT(m_terms.size()); + m_terms.shrink(m_terms.size() - 1); + } + + void setOnceThrough() + { + m_onceThrough = true; + } + + bool onceThrough() + { + return m_onceThrough; + } + + Vector<PatternTerm> m_terms; + PatternDisjunction* m_parent; + unsigned m_minimumSize; + bool m_onceThrough : 1; + bool m_hasFixedSize : 1; + bool m_startsWithBOL : 1; + bool m_containsBOL : 1; +}; + +struct PatternDisjunction { + WTF_MAKE_FAST_ALLOCATED; +public: + PatternDisjunction(PatternAlternative* parent = 0) + : m_parent(parent) + , m_hasFixedSize(false) + { + } + + ~PatternDisjunction() + { + deleteAllValues(m_alternatives); + } + + PatternAlternative* addNewAlternative() + { + PatternAlternative* alternative = new PatternAlternative(this); + m_alternatives.append(alternative); + return alternative; + } + + Vector<PatternAlternative*> m_alternatives; + PatternAlternative* m_parent; + unsigned m_minimumSize; + unsigned m_callFrameSize; + bool m_hasFixedSize; +}; + +// You probably don't want to be calling these functions directly +// (please to be calling newlineCharacterClass() et al on your +// friendly neighborhood YarrPattern instance to get nicely +// cached copies). +CharacterClass* newlineCreate(); +CharacterClass* digitsCreate(); +CharacterClass* spacesCreate(); +CharacterClass* wordcharCreate(); +CharacterClass* nondigitsCreate(); +CharacterClass* nonspacesCreate(); +CharacterClass* nonwordcharCreate(); + +struct TermChain { + TermChain(PatternTerm term) + : term(term) + {} + + PatternTerm term; + Vector<TermChain> hotTerms; +}; + +struct YarrPattern { + JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error); + + ~YarrPattern() + { + deleteAllValues(m_disjunctions); + deleteAllValues(m_userCharacterClasses); + } + + void reset() + { + m_numSubpatterns = 0; + m_maxBackReference = 0; + + m_containsBackreferences = false; + m_containsBOL = false; + + newlineCached = 0; + digitsCached = 0; + spacesCached = 0; + wordcharCached = 0; + nondigitsCached = 0; + nonspacesCached = 0; + nonwordcharCached = 0; + + deleteAllValues(m_disjunctions); + m_disjunctions.clear(); + deleteAllValues(m_userCharacterClasses); + m_userCharacterClasses.clear(); + } + + bool containsIllegalBackReference() + { + return m_maxBackReference > m_numSubpatterns; + } + + CharacterClass* newlineCharacterClass() + { + if (!newlineCached) + m_userCharacterClasses.append(newlineCached = newlineCreate()); + return newlineCached; + } + CharacterClass* digitsCharacterClass() + { + if (!digitsCached) + m_userCharacterClasses.append(digitsCached = digitsCreate()); + return digitsCached; + } + CharacterClass* spacesCharacterClass() + { + if (!spacesCached) + m_userCharacterClasses.append(spacesCached = spacesCreate()); + return spacesCached; + } + CharacterClass* wordcharCharacterClass() + { + if (!wordcharCached) + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); + return wordcharCached; + } + CharacterClass* nondigitsCharacterClass() + { + if (!nondigitsCached) + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); + return nondigitsCached; + } + CharacterClass* nonspacesCharacterClass() + { + if (!nonspacesCached) + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); + return nonspacesCached; + } + CharacterClass* nonwordcharCharacterClass() + { + if (!nonwordcharCached) + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); + return nonwordcharCached; + } + + bool m_ignoreCase : 1; + bool m_multiline : 1; + bool m_containsBackreferences : 1; + bool m_containsBOL : 1; + unsigned m_numSubpatterns; + unsigned m_maxBackReference; + PatternDisjunction* m_body; + Vector<PatternDisjunction*, 4> m_disjunctions; + Vector<CharacterClass*> m_userCharacterClasses; + +private: + const char* compile(const String& patternString); + + CharacterClass* newlineCached; + CharacterClass* digitsCached; + CharacterClass* spacesCached; + CharacterClass* wordcharCached; + CharacterClass* nondigitsCached; + CharacterClass* nonspacesCached; + CharacterClass* nonwordcharCached; +}; + +} } // namespace JSC::Yarr + +#endif // YarrPattern_h diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp new file mode 100644 index 0000000000..aa98c4a354 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrSyntaxChecker.h" + +#include "YarrParser.h" + +namespace JSC { namespace Yarr { + +class SyntaxChecker { +public: + void assertionBOL() {} + void assertionEOL() {} + void assertionWordBoundary(bool) {} + void atomPatternCharacter(UChar) {} + void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {} + void atomCharacterClassBegin(bool = false) {} + void atomCharacterClassAtom(UChar) {} + void atomCharacterClassRange(UChar, UChar) {} + void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {} + void atomCharacterClassEnd() {} + void atomParenthesesSubpatternBegin(bool = true) {} + void atomParentheticalAssertionBegin(bool = false) {} + void atomParenthesesEnd() {} + void atomBackReference(unsigned) {} + void quantifyAtom(unsigned, unsigned, bool) {} + void disjunction() {} +}; + +const char* checkSyntax(const String& pattern) +{ + SyntaxChecker syntaxChecker; + return parse(syntaxChecker, pattern); +} + +}} // JSC::YARR diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.h b/src/3rdparty/masm/yarr/YarrSyntaxChecker.h new file mode 100644 index 0000000000..104ced3ab4 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef YarrSyntaxChecker_h +#define YarrSyntaxChecker_h + +#include <wtf/text/WTFString.h> + +namespace JSC { namespace Yarr { + +const char* checkSyntax(const String& pattern); + +}} // JSC::YARR + +#endif // YarrSyntaxChecker_h + diff --git a/src/3rdparty/masm/yarr/yarr.pri b/src/3rdparty/masm/yarr/yarr.pri new file mode 100644 index 0000000000..7e9b4d3f3b --- /dev/null +++ b/src/3rdparty/masm/yarr/yarr.pri @@ -0,0 +1,12 @@ +# ------------------------------------------------------------------- +# Project file for YARR +# +# See 'Tools/qmake/README' for an overview of the build system +# ------------------------------------------------------------------- + +SOURCES += \ + $$PWD/YarrInterpreter.cpp \ + $$PWD/YarrPattern.cpp \ + $$PWD/YarrSyntaxChecker.cpp \ + $$PWD/YarrCanonicalizeUCS2.cpp + |