diff options
author | Ulf Hermann <ulf.hermann@qt.io> | 2019-02-22 11:35:34 +0100 |
---|---|---|
committer | Ulf Hermann <ulf.hermann@qt.io> | 2019-02-26 09:32:51 +0000 |
commit | a126b566dc9c0cebb6ef6ddc337e7665a1ce54e8 (patch) | |
tree | 805b913b4040b8747b2c61228e27d327128e121b /src/3rdparty/masm/yarr | |
parent | f52b331e06136bf9d47ec2077626515c9008f97d (diff) |
Upgrade Yarr to latest version from WebKit
This is an upgrade to commit cbb0aa18662bc26da31de91e2104c030eaa6ead2 in
webkit. It causes some more ecmascript tests to pass.
Fixes: QTBUG-73915
Change-Id: I8bb5ff9b37907d17b1020576ba64f0b3aed2f1b3
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
Diffstat (limited to 'src/3rdparty/masm/yarr')
-rw-r--r-- | src/3rdparty/masm/yarr/YarrCanonicalize.h | 1 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp | 122 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js | 21 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrErrorCode.h | 7 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrInterpreter.cpp | 43 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrJIT.cpp | 538 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrJIT.h | 5 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrParser.h | 23 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrPattern.cpp | 137 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrPattern.h | 22 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp | 4 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/create_regex_tables | 2 | ||||
-rw-r--r-- | src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode | 16 |
13 files changed, 720 insertions, 221 deletions
diff --git a/src/3rdparty/masm/yarr/YarrCanonicalize.h b/src/3rdparty/masm/yarr/YarrCanonicalize.h index fb5e0231ac..cbd279edca 100644 --- a/src/3rdparty/masm/yarr/YarrCanonicalize.h +++ b/src/3rdparty/masm/yarr/YarrCanonicalize.h @@ -53,6 +53,7 @@ struct CanonicalizationRange { extern const size_t UCS2_CANONICALIZATION_RANGES; extern const UChar32* const ucs2CharacterSetInfo[]; extern const CanonicalizationRange ucs2RangeInfo[]; +extern const uint16_t canonicalTableLChar[256]; extern const size_t UNICODE_CANONICALIZATION_RANGES; extern const UChar32* const unicodeCharacterSetInfo[]; diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp index d91c771590..0eb59f38d2 100644 --- a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013, 2015-2016 Apple Inc. All rights reserved. + * Copyright (C) 2012-2018 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -44,9 +44,17 @@ const UChar32 ucs2CharacterSet10[] = { 0x03a0, 0x03c0, 0x03d6, 0 }; const UChar32 ucs2CharacterSet11[] = { 0x03a1, 0x03c1, 0x03f1, 0 }; const UChar32 ucs2CharacterSet12[] = { 0x03a3, 0x03c2, 0x03c3, 0 }; const UChar32 ucs2CharacterSet13[] = { 0x03a6, 0x03c6, 0x03d5, 0 }; -const UChar32 ucs2CharacterSet14[] = { 0x1e60, 0x1e61, 0x1e9b, 0 }; +const UChar32 ucs2CharacterSet14[] = { 0x0412, 0x0432, 0x1c80, 0 }; +const UChar32 ucs2CharacterSet15[] = { 0x0414, 0x0434, 0x1c81, 0 }; +const UChar32 ucs2CharacterSet16[] = { 0x041e, 0x043e, 0x1c82, 0 }; +const UChar32 ucs2CharacterSet17[] = { 0x0421, 0x0441, 0x1c83, 0 }; +const UChar32 ucs2CharacterSet18[] = { 0x0422, 0x0442, 0x1c84, 0x1c85, 0 }; +const UChar32 ucs2CharacterSet19[] = { 0x042a, 0x044a, 0x1c86, 0 }; +const UChar32 ucs2CharacterSet20[] = { 0x0462, 0x0463, 0x1c87, 0 }; +const 
UChar32 ucs2CharacterSet21[] = { 0x1e60, 0x1e61, 0x1e9b, 0 }; +const UChar32 ucs2CharacterSet22[] = { 0x1c88, 0xa64a, 0xa64b, 0 }; -static const size_t UCS2_CANONICALIZATION_SETS = 15; +static const size_t UCS2_CANONICALIZATION_SETS = 23; const UChar32* const ucs2CharacterSetInfo[UCS2_CANONICALIZATION_SETS] = { ucs2CharacterSet0, ucs2CharacterSet1, @@ -63,9 +71,17 @@ const UChar32* const ucs2CharacterSetInfo[UCS2_CANONICALIZATION_SETS] = { ucs2CharacterSet12, ucs2CharacterSet13, ucs2CharacterSet14, + ucs2CharacterSet15, + ucs2CharacterSet16, + ucs2CharacterSet17, + ucs2CharacterSet18, + ucs2CharacterSet19, + ucs2CharacterSet20, + ucs2CharacterSet21, + ucs2CharacterSet22, }; -const size_t UCS2_CANONICALIZATION_RANGES = 391; +const size_t UCS2_CANONICALIZATION_RANGES = 448; const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0x0000, 0x0040, 0x0000, CanonicalizeUnique }, { 0x0041, 0x005a, 0x0020, CanonicalizeRangeLo }, @@ -182,7 +198,7 @@ const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0x0267, 0x0267, 0x0000, CanonicalizeUnique }, { 0x0268, 0x0268, 0x00d1, CanonicalizeRangeHi }, { 0x0269, 0x0269, 0x00d3, CanonicalizeRangeHi }, - { 0x026a, 0x026a, 0x0000, CanonicalizeUnique }, + { 0x026a, 0x026a, 0xa544, CanonicalizeRangeLo }, { 0x026b, 0x026b, 0x29f7, CanonicalizeRangeLo }, { 0x026c, 0x026c, 0xa541, CanonicalizeRangeLo }, { 0x026d, 0x026e, 0x0000, CanonicalizeUnique }, @@ -206,7 +222,8 @@ const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0x028c, 0x028c, 0x0047, CanonicalizeRangeHi }, { 0x028d, 0x0291, 0x0000, CanonicalizeUnique }, { 0x0292, 0x0292, 0x00db, CanonicalizeRangeHi }, - { 0x0293, 0x029d, 0x0000, CanonicalizeUnique }, + { 0x0293, 0x029c, 0x0000, CanonicalizeUnique }, + { 0x029d, 0x029d, 0xa515, CanonicalizeRangeLo }, { 0x029e, 0x029e, 0xa512, CanonicalizeRangeLo }, { 0x029f, 0x0344, 0x0000, CanonicalizeUnique }, { 0x0345, 0x0345, 0x0007, CanonicalizeSet }, @@ -288,10 +305,34 
@@ const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0x03fc, 0x03fc, 0x0000, CanonicalizeUnique }, { 0x03fd, 0x03ff, 0x0082, CanonicalizeRangeHi }, { 0x0400, 0x040f, 0x0050, CanonicalizeRangeLo }, - { 0x0410, 0x042f, 0x0020, CanonicalizeRangeLo }, - { 0x0430, 0x044f, 0x0020, CanonicalizeRangeHi }, + { 0x0410, 0x0411, 0x0020, CanonicalizeRangeLo }, + { 0x0412, 0x0412, 0x000e, CanonicalizeSet }, + { 0x0413, 0x0413, 0x0020, CanonicalizeRangeLo }, + { 0x0414, 0x0414, 0x000f, CanonicalizeSet }, + { 0x0415, 0x041d, 0x0020, CanonicalizeRangeLo }, + { 0x041e, 0x041e, 0x0010, CanonicalizeSet }, + { 0x041f, 0x0420, 0x0020, CanonicalizeRangeLo }, + { 0x0421, 0x0421, 0x0011, CanonicalizeSet }, + { 0x0422, 0x0422, 0x0012, CanonicalizeSet }, + { 0x0423, 0x0429, 0x0020, CanonicalizeRangeLo }, + { 0x042a, 0x042a, 0x0013, CanonicalizeSet }, + { 0x042b, 0x042f, 0x0020, CanonicalizeRangeLo }, + { 0x0430, 0x0431, 0x0020, CanonicalizeRangeHi }, + { 0x0432, 0x0432, 0x000e, CanonicalizeSet }, + { 0x0433, 0x0433, 0x0020, CanonicalizeRangeHi }, + { 0x0434, 0x0434, 0x000f, CanonicalizeSet }, + { 0x0435, 0x043d, 0x0020, CanonicalizeRangeHi }, + { 0x043e, 0x043e, 0x0010, CanonicalizeSet }, + { 0x043f, 0x0440, 0x0020, CanonicalizeRangeHi }, + { 0x0441, 0x0441, 0x0011, CanonicalizeSet }, + { 0x0442, 0x0442, 0x0012, CanonicalizeSet }, + { 0x0443, 0x0449, 0x0020, CanonicalizeRangeHi }, + { 0x044a, 0x044a, 0x0013, CanonicalizeSet }, + { 0x044b, 0x044f, 0x0020, CanonicalizeRangeHi }, { 0x0450, 0x045f, 0x0050, CanonicalizeRangeHi }, - { 0x0460, 0x0481, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0460, 0x0461, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0462, 0x0463, 0x0014, CanonicalizeSet }, + { 0x0464, 0x0481, 0x0000, CanonicalizeAlternatingAligned }, { 0x0482, 0x0489, 0x0000, CanonicalizeUnique }, { 0x048a, 0x04bf, 0x0000, CanonicalizeAlternatingAligned }, { 0x04c0, 0x04c0, 0x000f, CanonicalizeRangeLo }, @@ -308,16 +349,38 @@ const CanonicalizationRange 
ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0x10c7, 0x10c7, 0x1c60, CanonicalizeRangeLo }, { 0x10c8, 0x10cc, 0x0000, CanonicalizeUnique }, { 0x10cd, 0x10cd, 0x1c60, CanonicalizeRangeLo }, - { 0x10ce, 0x1d78, 0x0000, CanonicalizeUnique }, + { 0x10ce, 0x10cf, 0x0000, CanonicalizeUnique }, + { 0x10d0, 0x10fa, 0x0bc0, CanonicalizeRangeLo }, + { 0x10fb, 0x10fc, 0x0000, CanonicalizeUnique }, + { 0x10fd, 0x10ff, 0x0bc0, CanonicalizeRangeLo }, + { 0x1100, 0x139f, 0x0000, CanonicalizeUnique }, + { 0x13a0, 0x13ef, 0x97d0, CanonicalizeRangeLo }, + { 0x13f0, 0x13f5, 0x0008, CanonicalizeRangeLo }, + { 0x13f6, 0x13f7, 0x0000, CanonicalizeUnique }, + { 0x13f8, 0x13fd, 0x0008, CanonicalizeRangeHi }, + { 0x13fe, 0x1c7f, 0x0000, CanonicalizeUnique }, + { 0x1c80, 0x1c80, 0x000e, CanonicalizeSet }, + { 0x1c81, 0x1c81, 0x000f, CanonicalizeSet }, + { 0x1c82, 0x1c82, 0x0010, CanonicalizeSet }, + { 0x1c83, 0x1c83, 0x0011, CanonicalizeSet }, + { 0x1c84, 0x1c85, 0x0012, CanonicalizeSet }, + { 0x1c86, 0x1c86, 0x0013, CanonicalizeSet }, + { 0x1c87, 0x1c87, 0x0014, CanonicalizeSet }, + { 0x1c88, 0x1c88, 0x0016, CanonicalizeSet }, + { 0x1c89, 0x1c8f, 0x0000, CanonicalizeUnique }, + { 0x1c90, 0x1cba, 0x0bc0, CanonicalizeRangeHi }, + { 0x1cbb, 0x1cbc, 0x0000, CanonicalizeUnique }, + { 0x1cbd, 0x1cbf, 0x0bc0, CanonicalizeRangeHi }, + { 0x1cc0, 0x1d78, 0x0000, CanonicalizeUnique }, { 0x1d79, 0x1d79, 0x8a04, CanonicalizeRangeLo }, { 0x1d7a, 0x1d7c, 0x0000, CanonicalizeUnique }, { 0x1d7d, 0x1d7d, 0x0ee6, CanonicalizeRangeLo }, { 0x1d7e, 0x1dff, 0x0000, CanonicalizeUnique }, { 0x1e00, 0x1e5f, 0x0000, CanonicalizeAlternatingAligned }, - { 0x1e60, 0x1e61, 0x000e, CanonicalizeSet }, + { 0x1e60, 0x1e61, 0x0015, CanonicalizeSet }, { 0x1e62, 0x1e95, 0x0000, CanonicalizeAlternatingAligned }, { 0x1e96, 0x1e9a, 0x0000, CanonicalizeUnique }, - { 0x1e9b, 0x1e9b, 0x000e, CanonicalizeSet }, + { 0x1e9b, 0x1e9b, 0x0015, CanonicalizeSet }, { 0x1e9c, 0x1e9f, 0x0000, CanonicalizeUnique }, { 0x1ea0, 0x1eff, 
0x0000, CanonicalizeAlternatingAligned }, { 0x1f00, 0x1f07, 0x0008, CanonicalizeRangeLo }, @@ -428,7 +491,9 @@ const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0x2d28, 0x2d2c, 0x0000, CanonicalizeUnique }, { 0x2d2d, 0x2d2d, 0x1c60, CanonicalizeRangeHi }, { 0x2d2e, 0xa63f, 0x0000, CanonicalizeUnique }, - { 0xa640, 0xa66d, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa640, 0xa649, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa64a, 0xa64b, 0x0016, CanonicalizeSet }, + { 0xa64c, 0xa66d, 0x0000, CanonicalizeAlternatingAligned }, { 0xa66e, 0xa67f, 0x0000, CanonicalizeUnique }, { 0xa680, 0xa69b, 0x0000, CanonicalizeAlternatingAligned }, { 0xa69c, 0xa721, 0x0000, CanonicalizeUnique }, @@ -450,15 +515,42 @@ const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { { 0xa7ab, 0xa7ab, 0xa54f, CanonicalizeRangeHi }, { 0xa7ac, 0xa7ac, 0xa54b, CanonicalizeRangeHi }, { 0xa7ad, 0xa7ad, 0xa541, CanonicalizeRangeHi }, - { 0xa7ae, 0xa7af, 0x0000, CanonicalizeUnique }, + { 0xa7ae, 0xa7ae, 0xa544, CanonicalizeRangeHi }, + { 0xa7af, 0xa7af, 0x0000, CanonicalizeUnique }, { 0xa7b0, 0xa7b0, 0xa512, CanonicalizeRangeHi }, { 0xa7b1, 0xa7b1, 0xa52a, CanonicalizeRangeHi }, - { 0xa7b2, 0xff20, 0x0000, CanonicalizeUnique }, + { 0xa7b2, 0xa7b2, 0xa515, CanonicalizeRangeHi }, + { 0xa7b3, 0xa7b3, 0x03a0, CanonicalizeRangeLo }, + { 0xa7b4, 0xa7b9, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa7ba, 0xab52, 0x0000, CanonicalizeUnique }, + { 0xab53, 0xab53, 0x03a0, CanonicalizeRangeHi }, + { 0xab54, 0xab6f, 0x0000, CanonicalizeUnique }, + { 0xab70, 0xabbf, 0x97d0, CanonicalizeRangeHi }, + { 0xabc0, 0xff20, 0x0000, CanonicalizeUnique }, { 0xff21, 0xff3a, 0x0020, CanonicalizeRangeLo }, { 0xff3b, 0xff40, 0x0000, CanonicalizeUnique }, { 0xff41, 0xff5a, 0x0020, CanonicalizeRangeHi }, { 0xff5b, 0xffff, 0x0000, CanonicalizeUnique }, }; +const uint16_t canonicalTableLChar[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 
0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x39c, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x178 +}; + } } // JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js index dc578cfece..b92d8bdd4f 100644 --- a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012, 2016 Apple Inc. All rights reserved. + * Copyright (C) 2012-2018 Apple Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,7 +27,7 @@ function printHeader() { var copyright = ( "/*" + "\n" + - " * Copyright (C) 2012-2013, 2015-2016 Apple Inc. All rights reserved." + "\n" + + " * Copyright (C) 2012-2018 Apple Inc. All rights reserved." + "\n" + " *" + "\n" + " * Redistribution and use in source and binary forms, with or without" + "\n" + " * modification, are permitted provided that the following conditions" + "\n" + @@ -183,6 +183,23 @@ function createTables(prefix, maxValue, canonicalGroups) } print("};"); print(); + // Create canonical table for LChar domain + let line = "const uint16_t canonicalTableLChar[256] = {"; + for (let i = 0; i < 256; i++) { + if (!(i % 16)) { + print(line); + line = " "; + } + let canonicalChar = canonicalize(i); + line = line + (canonicalChar < 16 ? "0x0" : "0x") + canonicalChar.toString(16); + if ((i % 16) != 15) + line += ", "; + else if (i != 255) + line += ","; + } + print(line); + print("};"); + print(); } printHeader(); diff --git a/src/3rdparty/masm/yarr/YarrErrorCode.h b/src/3rdparty/masm/yarr/YarrErrorCode.h index 48f2bb7900..3f06a6bff1 100644 --- a/src/3rdparty/masm/yarr/YarrErrorCode.h +++ b/src/3rdparty/masm/yarr/YarrErrorCode.h @@ -60,6 +60,13 @@ inline bool hasError(ErrorCode errorCode) { return errorCode != ErrorCode::NoError; } + +inline bool hasHardError(ErrorCode errorCode) +{ + // TooManyDisjunctions means that we ran out stack compiling. + // All other errors are due to problems in the expression. 
+ return hasError(errorCode) && errorCode != ErrorCode::TooManyDisjunctions; +} JS_EXPORT_PRIVATE JSObject* errorToThrow(ExecState*, ErrorCode); } } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.cpp b/src/3rdparty/masm/yarr/YarrInterpreter.cpp index 4d3652fcbc..cdcd16af64 100644 --- a/src/3rdparty/masm/yarr/YarrInterpreter.cpp +++ b/src/3rdparty/masm/yarr/YarrInterpreter.cpp @@ -32,12 +32,12 @@ #include "Yarr.h" #include "YarrCanonicalize.h" #include <wtf/BumpPointerAllocator.h> +#include <wtf/CheckedArithmetic.h> #include <wtf/DataLog.h> +#include <wtf/StdLibExtras.h> #include <wtf/text/CString.h> #include <wtf/text/WTFString.h> -using namespace WTF; - namespace JSC { namespace Yarr { template<typename CharType> @@ -67,17 +67,23 @@ public: struct DisjunctionContext { - DisjunctionContext() - : term(0) - { - } + DisjunctionContext() = default; void* operator new(size_t, void* where) { return where; } - int term; + static size_t allocationSize(unsigned numberOfFrames) + { + static_assert(alignof(DisjunctionContext) <= sizeof(void*), ""); + size_t rawSize = (sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t)).unsafeGet(); + size_t roundedSize = WTF::roundUpToMultipleOf<sizeof(void*)>(rawSize); + RELEASE_ASSERT(roundedSize >= rawSize); + return roundedSize; + } + + int term { 0 }; unsigned matchBegin; unsigned matchEnd; uintptr_t frame[1]; @@ -85,7 +91,7 @@ public: DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) { - size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize); allocatorPool = allocatorPool->ensureCapacity(size); RELEASE_ASSERT(allocatorPool); return new (allocatorPool->alloc(size)) DisjunctionContext(); @@ -99,7 +105,6 @@ public: struct ParenthesesDisjunctionContext { ParenthesesDisjunctionContext(unsigned* output, ByteTerm& 
term) - : next(0) { unsigned firstSubpatternId = term.atom.subpatternId; unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; @@ -125,16 +130,25 @@ public: DisjunctionContext* getDisjunctionContext(ByteTerm& term) { - return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); + return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)); } - ParenthesesDisjunctionContext* next; + static size_t allocationSize(unsigned numberOfSubpatterns) + { + static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*), ""); + size_t rawSize = (sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned)).unsafeGet(); + size_t roundedSize = WTF::roundUpToMultipleOf<sizeof(void*)>(rawSize); + RELEASE_ASSERT(roundedSize >= rawSize); + return roundedSize; + } + + ParenthesesDisjunctionContext* next { nullptr }; unsigned subpatternBackup[1]; }; ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) { - size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + static_cast<size_t>(disjunction->m_frameSize) * sizeof(uintptr_t); + size_t size = (Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize)).unsafeGet(); allocatorPool = allocatorPool->ensureCapacity(size); RELEASE_ASSERT(allocatorPool); return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); @@ -1630,7 +1644,6 @@ public: , unicode(pattern->unicode()) , output(output) , input(input, start, length, pattern->unicode()) - , 
allocatorPool(0) , startOffset(start) , remainingMatchCount(matchLimit) { @@ -1641,7 +1654,7 @@ private: bool unicode; unsigned* output; InputStream input; - BumpPointerPool* allocatorPool; + WTF::BumpPointerPool* allocatorPool { nullptr }; unsigned startOffset; unsigned remainingMatchCount; }; @@ -1740,7 +1753,7 @@ public: void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { - unsigned beginTerm = m_bodyDisjunction->terms.size(); + int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; diff --git a/src/3rdparty/masm/yarr/YarrJIT.cpp b/src/3rdparty/masm/yarr/YarrJIT.cpp index da65b772f7..1c8138c66e 100644 --- a/src/3rdparty/masm/yarr/YarrJIT.cpp +++ b/src/3rdparty/masm/yarr/YarrJIT.cpp @@ -37,15 +37,12 @@ #if ENABLE(YARR_JIT) -using namespace WTF; - namespace JSC { namespace Yarr { template<YarrJITCompileMode compileMode> class YarrGenerator : private DefaultMacroAssembler { - friend void jitCompile(VM*, YarrCodeBlock&, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); -#if CPU(ARM) +#if CPU(ARM_THUMB2) static const RegisterID input = ARMRegisters::r0; static const RegisterID index = ARMRegisters::r1; static const RegisterID length = ARMRegisters::r2; @@ -477,6 +474,12 @@ class YarrGenerator : private DefaultMacroAssembler { return branch32(BelowOrEqual, index, length); } + Jump checkNotEnoughInput(RegisterID additionalAmount) + { + add32(index, additionalAmount); + return branch32(Above, additionalAmount, length); + } + Jump checkInput() { return branch32(BelowOrEqual, index, length); @@ -559,6 +562,16 @@ class YarrGenerator : private DefaultMacroAssembler { } #endif + void 
readCharacterDontDecodeSurrogates(Checked<unsigned> negativeCharacterOffset, RegisterID resultReg, RegisterID indexReg = index) + { + BaseIndex address = negativeOffsetIndexedAddress(negativeCharacterOffset, resultReg, indexReg); + + if (m_charSize == Char8) + load8(address, resultReg); + else + load16Unaligned(address, resultReg); + } + void readCharacter(Checked<unsigned> negativeCharacterOffset, RegisterID resultReg, RegisterID indexReg = index) { BaseIndex address = negativeOffsetIndexedAddress(negativeCharacterOffset, resultReg, indexReg); @@ -809,16 +822,16 @@ class YarrGenerator : private DefaultMacroAssembler { // The operation, as a YarrOpCode, and also a reference to the PatternTerm. YarrOpCode m_op; - PatternTerm* m_term; + PatternTerm* m_term = nullptr; // For alternatives, this holds the PatternAlternative and doubly linked // references to this alternative's siblings. In the case of the // OpBodyAlternativeEnd node at the end of a section of repeating nodes, // m_nextOp will reference the OpBodyAlternativeBegin node of the first // repeating alternative. 
- PatternAlternative* m_alternative; - size_t m_previousOp; - size_t m_nextOp; + PatternAlternative* m_alternative = nullptr; + size_t m_previousOp = 0; + size_t m_nextOp = 0; // Used to record a set of Jumps out of the generated code, typically // used for jumps out to backtracking code, and a single reentry back @@ -1119,6 +1132,228 @@ class YarrGenerator : private DefaultMacroAssembler { backtrackTermDefault(opIndex); } +#if ENABLE(YARR_JIT_BACKREFERENCES) + void matchBackreference(size_t opIndex, JumpList& characterMatchFails, RegisterID character, RegisterID patternIndex, RegisterID patternCharacter) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + unsigned subpatternId = term->backReferenceSubpatternId; + + Label loop(this); + + readCharacterDontDecodeSurrogates(0, patternCharacter, patternIndex); + readCharacterDontDecodeSurrogates(m_checkedOffset - term->inputPosition, character); + + if (!m_pattern.ignoreCase()) + characterMatchFails.append(branch32(NotEqual, character, patternCharacter)); + else { + Jump charactersMatch = branch32(Equal, character, patternCharacter); + ExtendedAddress characterTableEntry(character, reinterpret_cast<intptr_t>(&canonicalTableLChar)); + load16(characterTableEntry, character); + ExtendedAddress patternTableEntry(patternCharacter, reinterpret_cast<intptr_t>(&canonicalTableLChar)); + load16(patternTableEntry, patternCharacter); + characterMatchFails.append(branch32(NotEqual, character, patternCharacter)); + charactersMatch.link(this); + } + + + add32(TrustedImm32(1), index); + add32(TrustedImm32(1), patternIndex); + + branch32(NotEqual, patternIndex, Address(output, ((subpatternId << 1) + 1) * sizeof(int))).linkTo(loop, this); + } + + void generateBackReference(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + if (m_pattern.ignoreCase() && m_charSize != Char8) { + m_failureReason = JITFailureReason::BackReference; + return; + } + + unsigned subpatternId = 
term->backReferenceSubpatternId; + unsigned parenthesesFrameLocation = term->frameLocation; + + const RegisterID characterOrTemp = regT0; + const RegisterID patternIndex = regT1; + const RegisterID patternTemp = regT2; + + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoBackReference::beginIndex()); + if (term->quantityType != QuantifierFixedCount || term->quantityMaxCount != 1) + storeToFrame(TrustedImm32(0), parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + + JumpList matches; + + if (term->quantityType != QuantifierNonGreedy) { + load32(Address(output, (subpatternId << 1) * sizeof(int)), patternIndex); + load32(Address(output, ((subpatternId << 1) + 1) * sizeof(int)), patternTemp); + + // An empty match is successful without consuming characters + if (term->quantityType != QuantifierFixedCount || term->quantityMaxCount != 1) { + matches.append(branch32(Equal, TrustedImm32(-1), patternIndex)); + matches.append(branch32(Equal, patternIndex, patternTemp)); + } else { + Jump zeroLengthMatch = branch32(Equal, TrustedImm32(-1), patternIndex); + Jump tryNonZeroMatch = branch32(NotEqual, patternIndex, patternTemp); + zeroLengthMatch.link(this); + storeToFrame(TrustedImm32(1), parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + matches.append(jump()); + tryNonZeroMatch.link(this); + } + } + + switch (term->quantityType) { + case QuantifierFixedCount: { + Label outerLoop(this); + + // PatternTemp should contain pattern end index at this point + sub32(patternIndex, patternTemp); + if (m_checkedOffset - term->inputPosition) + sub32(Imm32((m_checkedOffset - term->inputPosition).unsafeGet()), patternTemp); + op.m_jumps.append(checkNotEnoughInput(patternTemp)); + + matchBackreference(opIndex, op.m_jumps, characterOrTemp, patternIndex, patternTemp); + + if (term->quantityMaxCount != 1) { + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex(), characterOrTemp); + 
add32(TrustedImm32(1), characterOrTemp); + storeToFrame(characterOrTemp, parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + matches.append(branch32(Equal, Imm32(term->quantityMaxCount.unsafeGet()), characterOrTemp)); + load32(Address(output, (subpatternId << 1) * sizeof(int)), patternIndex); + load32(Address(output, ((subpatternId << 1) + 1) * sizeof(int)), patternTemp); + jump(outerLoop); + } + matches.link(this); + break; + } + + case QuantifierGreedy: { + JumpList incompleteMatches; + + Label outerLoop(this); + + // PatternTemp should contain pattern end index at this point + sub32(patternIndex, patternTemp); + if (m_checkedOffset - term->inputPosition) + sub32(Imm32((m_checkedOffset - term->inputPosition).unsafeGet()), patternTemp); + matches.append(checkNotEnoughInput(patternTemp)); + + matchBackreference(opIndex, incompleteMatches, characterOrTemp, patternIndex, patternTemp); + + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex(), characterOrTemp); + add32(TrustedImm32(1), characterOrTemp); + storeToFrame(characterOrTemp, parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + if (term->quantityMaxCount != quantifyInfinite) + matches.append(branch32(Equal, Imm32(term->quantityMaxCount.unsafeGet()), characterOrTemp)); + load32(Address(output, (subpatternId << 1) * sizeof(int)), patternIndex); + load32(Address(output, ((subpatternId << 1) + 1) * sizeof(int)), patternTemp); + + // Store current index in frame for restoring after a partial match + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoBackReference::beginIndex()); + jump(outerLoop); + + incompleteMatches.link(this); + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::beginIndex(), index); + + matches.link(this); + op.m_reentry = label(); + break; + } + + case QuantifierNonGreedy: { + JumpList incompleteMatches; + + matches.append(jump()); + + op.m_reentry = label(); + + 
load32(Address(output, (subpatternId << 1) * sizeof(int)), patternIndex); + load32(Address(output, ((subpatternId << 1) + 1) * sizeof(int)), patternTemp); + + // An empty match is successful without consuming characters + Jump zeroLengthMatch = branch32(Equal, TrustedImm32(-1), patternIndex); + Jump tryNonZeroMatch = branch32(NotEqual, patternIndex, patternTemp); + zeroLengthMatch.link(this); + storeToFrame(TrustedImm32(1), parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + matches.append(jump()); + tryNonZeroMatch.link(this); + + // Check if we have input remaining to match + sub32(patternIndex, patternTemp); + if (m_checkedOffset - term->inputPosition) + sub32(Imm32((m_checkedOffset - term->inputPosition).unsafeGet()), patternTemp); + matches.append(checkNotEnoughInput(patternTemp)); + + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoBackReference::beginIndex()); + + matchBackreference(opIndex, incompleteMatches, characterOrTemp, patternIndex, patternTemp); + + matches.append(jump()); + + incompleteMatches.link(this); + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::beginIndex(), index); + + matches.link(this); + break; + } + } + } + void backtrackBackReference(size_t opIndex) + { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + unsigned subpatternId = term->backReferenceSubpatternId; + + m_backtrackingState.link(this); + op.m_jumps.link(this); + + JumpList failures; + + unsigned parenthesesFrameLocation = term->frameLocation; + switch (term->quantityType) { + case QuantifierFixedCount: + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::beginIndex(), index); + break; + + case QuantifierGreedy: { + const RegisterID matchAmount = regT0; + const RegisterID patternStartIndex = regT1; + const RegisterID patternEndIndexOrLen = regT2; + + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex(), matchAmount); + 
failures.append(branchTest32(Zero, matchAmount)); + + load32(Address(output, (subpatternId << 1) * sizeof(int)), patternStartIndex); + load32(Address(output, ((subpatternId << 1) + 1) * sizeof(int)), patternEndIndexOrLen); + sub32(patternStartIndex, patternEndIndexOrLen); + sub32(patternEndIndexOrLen, index); + + sub32(TrustedImm32(1), matchAmount); + storeToFrame(matchAmount, parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + jump(op.m_reentry); + break; + } + + case QuantifierNonGreedy: { + const RegisterID matchAmount = regT0; + + loadFromFrame(parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex(), matchAmount); + if (term->quantityMaxCount != quantifyInfinite) + failures.append(branch32(AboveOrEqual, Imm32(term->quantityMaxCount.unsafeGet()), matchAmount)); + add32(TrustedImm32(1), matchAmount); + storeToFrame(matchAmount, parenthesesFrameLocation + BackTrackInfoBackReference::matchAmountIndex()); + jump(op.m_reentry); + break; + } + } + failures.link(this); + m_backtrackingState.fallthrough(); + } +#endif + void generatePatternCharacterOnce(size_t opIndex) { YarrOp& op = m_ops[opIndex]; @@ -1141,12 +1376,16 @@ class YarrGenerator : private DefaultMacroAssembler { } const RegisterID character = regT0; +#if CPU(X86_64) || CPU(ARM64) + unsigned maxCharactersAtOnce = m_charSize == Char8 ? 8 : 4; +#else unsigned maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; - unsigned ignoreCaseMask = 0; +#endif + uint64_t ignoreCaseMask = 0; #if CPU(BIG_ENDIAN) - int allCharacters = ch << (m_charSize == Char8 ? 24 : 16); + uint64_t allCharacters = ch << (m_charSize == Char8 ? 24 : 16); #else - int allCharacters = ch; + uint64_t allCharacters = ch; #endif unsigned numberCharacters; unsigned startTermPosition = term->inputPosition; @@ -1155,16 +1394,19 @@ class YarrGenerator : private DefaultMacroAssembler { // upper & lower case representations are converted to a character class. 
ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch, m_canonicalMode)); - if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) { #if CPU(BIG_ENDIAN) ignoreCaseMask |= 32 << (m_charSize == Char8 ? 24 : 16); #else ignoreCaseMask |= 32; #endif + } for (numberCharacters = 1; numberCharacters < maxCharactersAtOnce && nextOp->m_op == OpTerm; ++numberCharacters, nextOp = &m_ops[opIndex + numberCharacters]) { PatternTerm* nextTerm = nextOp->m_term; - + + // YarrJIT handles decoded surrogate pair as one character if unicode flag is enabled. + // Note that the numberCharacters become 1 while the width of the pattern character becomes 32bit in this case. if (nextTerm->type != PatternTerm::TypePatternCharacter || nextTerm->quantityType != QuantifierFixedCount || nextTerm->quantityMaxCount != 1 @@ -1192,49 +1434,132 @@ class YarrGenerator : private DefaultMacroAssembler { // upper & lower case representations are converted to a character class. 
ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter, m_canonicalMode)); - allCharacters |= (currentCharacter << shiftAmount); + allCharacters |= (static_cast<uint64_t>(currentCharacter) << shiftAmount); if ((m_pattern.ignoreCase()) && (isASCIIAlpha(currentCharacter))) - ignoreCaseMask |= 32 << shiftAmount; + ignoreCaseMask |= 32ULL << shiftAmount; } + if (m_decodeSurrogatePairs) + op.m_jumps.append(jumpIfNoAvailableInput()); + if (m_charSize == Char8) { + auto check1 = [&] (Checked<unsigned> offset, UChar32 characters) { + op.m_jumps.append(jumpIfCharNotEquals(characters, offset, character)); + }; + + auto check2 = [&] (Checked<unsigned> offset, uint16_t characters, uint16_t mask) { + load16Unaligned(negativeOffsetIndexedAddress(offset, character), character); + if (mask) + or32(Imm32(mask), character); + op.m_jumps.append(branch32(NotEqual, character, Imm32(characters | mask))); + }; + + auto check4 = [&] (Checked<unsigned> offset, unsigned characters, unsigned mask) { + if (mask) { + load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(offset, character), character); + if (mask) + or32(Imm32(mask), character); + op.m_jumps.append(branch32(NotEqual, character, Imm32(characters | mask))); + return; + } + op.m_jumps.append(branch32WithUnalignedHalfWords(NotEqual, negativeOffsetIndexedAddress(offset, character), TrustedImm32(characters))); + }; + +#if CPU(X86_64) || CPU(ARM64) + auto check8 = [&] (Checked<unsigned> offset, uint64_t characters, uint64_t mask) { + load64(negativeOffsetIndexedAddress(offset, character), character); + if (mask) + or64(TrustedImm64(mask), character); + op.m_jumps.append(branch64(NotEqual, character, TrustedImm64(characters | mask))); + }; +#endif + switch (numberCharacters) { case 1: - op.m_jumps.append(jumpIfCharNotEquals(ch, m_checkedOffset - startTermPosition, character)); + // Use 32bit width of allCharacters since Yarr counts surrogate pairs as one character with unicode flag. 
+ check1(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff); return; case 2: { - load16Unaligned(negativeOffsetIndexedAddress(m_checkedOffset - startTermPosition, character), character); - break; + check2(m_checkedOffset - startTermPosition, allCharacters & 0xffff, ignoreCaseMask & 0xffff); + return; } case 3: { - load16Unaligned(negativeOffsetIndexedAddress(m_checkedOffset - startTermPosition, character), character); - if (ignoreCaseMask) - or32(Imm32(ignoreCaseMask), character); - op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask))); - op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, m_checkedOffset - startTermPosition - 2, character)); + check2(m_checkedOffset - startTermPosition, allCharacters & 0xffff, ignoreCaseMask & 0xffff); + check1(m_checkedOffset - startTermPosition - 2, (allCharacters >> 16) & 0xff); return; } case 4: { - load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(m_checkedOffset- startTermPosition, character), character); - break; + check4(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff, ignoreCaseMask & 0xffffffff); + return; + } +#if CPU(X86_64) || CPU(ARM64) + case 5: { + check4(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff, ignoreCaseMask & 0xffffffff); + check1(m_checkedOffset - startTermPosition - 4, (allCharacters >> 32) & 0xff); + return; + } + case 6: { + check4(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff, ignoreCaseMask & 0xffffffff); + check2(m_checkedOffset - startTermPosition - 4, (allCharacters >> 32) & 0xffff, (ignoreCaseMask >> 32) & 0xffff); + return; + } + case 7: { + check4(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff, ignoreCaseMask & 0xffffffff); + check2(m_checkedOffset - startTermPosition - 4, (allCharacters >> 32) & 0xffff, (ignoreCaseMask >> 32) & 0xffff); + check1(m_checkedOffset - startTermPosition - 6, (allCharacters >> 48) & 0xff); + return; + } + case 8: { + 
check8(m_checkedOffset - startTermPosition, allCharacters, ignoreCaseMask); + return; } +#endif } } else { + auto check1 = [&] (Checked<unsigned> offset, UChar32 characters) { + op.m_jumps.append(jumpIfCharNotEquals(characters, offset, character)); + }; + + auto check2 = [&] (Checked<unsigned> offset, unsigned characters, unsigned mask) { + if (mask) { + load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(offset, character), character); + if (mask) + or32(Imm32(mask), character); + op.m_jumps.append(branch32(NotEqual, character, Imm32(characters | mask))); + return; + } + op.m_jumps.append(branch32WithUnalignedHalfWords(NotEqual, negativeOffsetIndexedAddress(offset, character), TrustedImm32(characters))); + }; + +#if CPU(X86_64) || CPU(ARM64) + auto check4 = [&] (Checked<unsigned> offset, uint64_t characters, uint64_t mask) { + load64(negativeOffsetIndexedAddress(offset, character), character); + if (mask) + or64(TrustedImm64(mask), character); + op.m_jumps.append(branch64(NotEqual, character, TrustedImm64(characters | mask))); + }; +#endif + switch (numberCharacters) { case 1: - op.m_jumps.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); + // Use 32bit width of allCharacters since Yarr counts surrogate pairs as one character with unicode flag. 
+ check1(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff); return; case 2: - load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(m_checkedOffset- term->inputPosition, character), character); - break; + check2(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff, ignoreCaseMask & 0xffffffff); + return; +#if CPU(X86_64) || CPU(ARM64) + case 3: + check2(m_checkedOffset - startTermPosition, allCharacters & 0xffffffff, ignoreCaseMask & 0xffffffff); + check1(m_checkedOffset - startTermPosition - 2, (allCharacters >> 32) & 0xffff); + return; + case 4: + check4(m_checkedOffset - startTermPosition, allCharacters, ignoreCaseMask); + return; +#endif } } - - if (ignoreCaseMask) - or32(Imm32(ignoreCaseMask), character); - op.m_jumps.append(branch32(NotEqual, character, Imm32(allCharacters | ignoreCaseMask))); - return; } void backtrackPatternCharacterOnce(size_t opIndex) { @@ -1250,6 +1575,9 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; const RegisterID countRegister = regT1; + if (m_decodeSurrogatePairs) + op.m_jumps.append(jumpIfNoAvailableInput()); + move(index, countRegister); Checked<unsigned> scaledMaxCount = term->quantityMaxCount; scaledMaxCount *= U_IS_BMP(ch) ? 
1 : 2; @@ -1403,8 +1731,10 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; - if (m_decodeSurrogatePairs) + if (m_decodeSurrogatePairs) { + op.m_jumps.append(jumpIfNoAvailableInput()); storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex()); + } JumpList matchDest; readCharacter(m_checkedOffset - term->inputPosition, character); @@ -1451,6 +1781,9 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; const RegisterID countRegister = regT1; + if (m_decodeSurrogatePairs) + op.m_jumps.append(jumpIfNoAvailableInput()); + move(index, countRegister); sub32(Imm32(term->quantityMaxCount.unsafeGet()), countRegister); @@ -1780,13 +2113,19 @@ class YarrGenerator : private DefaultMacroAssembler { break; case PatternTerm::TypeForwardReference: + m_failureReason = JITFailureReason::ForwardReference; break; case PatternTerm::TypeParenthesesSubpattern: case PatternTerm::TypeParentheticalAssertion: RELEASE_ASSERT_NOT_REACHED(); + case PatternTerm::TypeBackReference: +#if ENABLE(YARR_JIT_BACKREFERENCES) + generateBackReference(opIndex); +#else m_failureReason = JITFailureReason::BackReference; +#endif break; case PatternTerm::TypeDotStarEnclosure: generateDotStarEnclosure(opIndex); @@ -1846,18 +2185,23 @@ class YarrGenerator : private DefaultMacroAssembler { break; case PatternTerm::TypeForwardReference: + m_failureReason = JITFailureReason::ForwardReference; break; case PatternTerm::TypeParenthesesSubpattern: case PatternTerm::TypeParentheticalAssertion: RELEASE_ASSERT_NOT_REACHED(); - case PatternTerm::TypeDotStarEnclosure: - backtrackDotStarEnclosure(opIndex); - break; - case PatternTerm::TypeBackReference: +#if ENABLE(YARR_JIT_BACKREFERENCES) + backtrackBackReference(opIndex); +#else m_failureReason = JITFailureReason::BackReference; +#endif + break; + + case PatternTerm::TypeDotStarEnclosure: + backtrackDotStarEnclosure(opIndex); break; } } @@ -2157,7 +2501,7 @@ class 
YarrGenerator : private DefaultMacroAssembler { } // If the parentheses are quantified Greedy then add a label to jump back - // to if get a failed match from after the parentheses. For NonGreedy + // to if we get a failed match from after the parentheses. For NonGreedy // parentheses, link the jump from before the subpattern to here. if (term->quantityType == QuantifierGreedy) op.m_reentry = label(); @@ -2221,11 +2565,11 @@ class YarrGenerator : private DefaultMacroAssembler { // match within the parentheses, or the second having skipped over them. // - To check for empty matches, which must be rejected. // - // At the head of a NonGreedy set of parentheses we'll immediately set the - // value on the stack to -1 (indicating a match skipping the subpattern), + // At the head of a NonGreedy set of parentheses we'll immediately set 'begin' + // in the backtrack info to -1 (indicating a match skipping the subpattern), // and plant a jump to the end. We'll also plant a label to backtrack to - // to reenter the subpattern later, with a store to set up index on the - // second iteration. + // to reenter the subpattern later, with a store to set 'begin' to current index + // on the second iteration. // // FIXME: for capturing parens, could use the index in the capture array? if (term->quantityType == QuantifierGreedy || term->quantityType == QuantifierNonGreedy) { @@ -2312,7 +2656,7 @@ class YarrGenerator : private DefaultMacroAssembler { } // If the parentheses are quantified Greedy then add a label to jump back - // to if get a failed match from after the parentheses. For NonGreedy + // to if we get a failed match from after the parentheses. For NonGreedy // parentheses, link the jump from before the subpattern to here. 
if (term->quantityType == QuantifierGreedy) { if (term->quantityMaxCount != quantifyInfinite) @@ -2324,6 +2668,7 @@ class YarrGenerator : private DefaultMacroAssembler { } else if (term->quantityType == QuantifierNonGreedy) { YarrOp& beginOp = m_ops[op.m_previousOp]; beginOp.m_jumps.link(this); + op.m_reentry = label(); } #else // !YARR_JIT_ALL_PARENS_EXPRESSIONS RELEASE_ASSERT_NOT_REACHED(); @@ -2385,6 +2730,7 @@ class YarrGenerator : private DefaultMacroAssembler { do { --opIndex; + YarrOp& op = m_ops[opIndex]; switch (op.m_op) { @@ -2881,32 +3227,32 @@ class YarrGenerator : private DefaultMacroAssembler { if (term->quantityType != QuantifierFixedCount) { m_backtrackingState.link(this); - if (term->quantityType == QuantifierGreedy) { - RegisterID currParenContextReg = regT0; - RegisterID newParenContextReg = regT1; + RegisterID currParenContextReg = regT0; + RegisterID newParenContextReg = regT1; - loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex(), currParenContextReg); + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex(), currParenContextReg); - restoreParenContext(currParenContextReg, regT2, term->parentheses.subpatternId, term->parentheses.lastSubpatternId, parenthesesFrameLocation); + restoreParenContext(currParenContextReg, regT2, term->parentheses.subpatternId, term->parentheses.lastSubpatternId, parenthesesFrameLocation); - freeParenContext(currParenContextReg, newParenContextReg); - storeToFrame(newParenContextReg, parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex()); - const RegisterID countTemporary = regT0; - loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex(), countTemporary); - Jump zeroLengthMatch = branchTest32(Zero, countTemporary); + freeParenContext(currParenContextReg, newParenContextReg); + storeToFrame(newParenContextReg, parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex()); - 
sub32(TrustedImm32(1), countTemporary); - storeToFrame(countTemporary, parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex()); + const RegisterID countTemporary = regT0; + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex(), countTemporary); + Jump zeroLengthMatch = branchTest32(Zero, countTemporary); - jump(m_ops[op.m_nextOp].m_reentry); + sub32(TrustedImm32(1), countTemporary); + storeToFrame(countTemporary, parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex()); - zeroLengthMatch.link(this); + jump(m_ops[op.m_nextOp].m_reentry); - // Clear the flag in the stackframe indicating we didn't run through the subpattern. - storeToFrame(TrustedImm32(-1), parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()); + zeroLengthMatch.link(this); + // Clear the flag in the stackframe indicating we didn't run through the subpattern. + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()); + + if (term->quantityType == QuantifierGreedy) jump(m_ops[op.m_nextOp].m_reentry); - } // If Greedy, jump to the end. if (term->quantityType == QuantifierGreedy) { @@ -2929,13 +3275,14 @@ class YarrGenerator : private DefaultMacroAssembler { if (term->quantityType != QuantifierFixedCount) { m_backtrackingState.link(this); - // Check whether we should backtrack back into the parentheses, or if we - // are currently in a state where we had skipped over the subpattern - // (in which case the flag value on the stack will be -1). 
unsigned parenthesesFrameLocation = term->frameLocation; - Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, (parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()) * sizeof(void*)), TrustedImm32(-1)); if (term->quantityType == QuantifierGreedy) { + // Check whether we should backtrack back into the parentheses, or if we + // are currently in a state where we had skipped over the subpattern + // (in which case the flag value on the stack will be -1). + Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, (parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()) * sizeof(void*)), TrustedImm32(-1)); + // For Greedy parentheses, we skip after having already tried going // through the subpattern, so if we get here we're done. YarrOp& beginOp = m_ops[op.m_previousOp]; @@ -2946,8 +3293,25 @@ class YarrGenerator : private DefaultMacroAssembler { // next. Jump back to the start of the parentheses in the forwards // matching path. ASSERT(term->quantityType == QuantifierNonGreedy); + + const RegisterID beginTemporary = regT0; + const RegisterID countTemporary = regT1; + YarrOp& beginOp = m_ops[op.m_previousOp]; - hadSkipped.linkTo(beginOp.m_reentry, this); + + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex(), beginTemporary); + branch32(Equal, beginTemporary, TrustedImm32(-1)).linkTo(beginOp.m_reentry, this); + + JumpList exceededMatchLimit; + + if (term->quantityMaxCount != quantifyInfinite) { + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex(), countTemporary); + exceededMatchLimit.append(branch32(AboveOrEqual, countTemporary, Imm32(term->quantityMaxCount.unsafeGet()))); + } + + branch32(Above, index, beginTemporary).linkTo(beginOp.m_reentry, this); + + exceededMatchLimit.link(this); } m_backtrackingState.fallthrough(); @@ -3021,7 +3385,7 @@ class YarrGenerator : private DefaultMacroAssembler { // the parentheses. 
// Supported types of parentheses are 'Once' (quantityMaxCount == 1), // 'Terminal' (non-capturing parentheses quantified as greedy - // and infinite), and 0 based greedy quantified parentheses. + // and infinite), and 0 based greedy / non-greedy quantified parentheses. // Alternatives will use the 'Simple' set of ops if either the // subpattern is terminal (in which case we will never need to // backtrack), or if the subpattern only contains one alternative. @@ -3043,7 +3407,9 @@ class YarrGenerator : private DefaultMacroAssembler { if (term->quantityMinCount && term->quantityMinCount != term->quantityMaxCount) { m_failureReason = JITFailureReason::VariableCountedParenthesisWithNonZeroMinimum; return; - } if (term->quantityMaxCount == 1 && !term->parentheses.isCopy) { + } + + if (term->quantityMaxCount == 1 && !term->parentheses.isCopy) { // Select the 'Once' nodes. parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin; parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd; @@ -3060,10 +3426,10 @@ class YarrGenerator : private DefaultMacroAssembler { parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd; } else { #if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) - // We only handle generic parenthesis with greedy counts. - if (term->quantityType != QuantifierGreedy) { + // We only handle generic parenthesis with non-fixed counts. + if (term->quantityType == QuantifierFixedCount) { // This subpattern is not supported by the JIT. - m_failureReason = JITFailureReason::NonGreedyParenthesizedSubpattern; + m_failureReason = JITFailureReason::FixedCountParenthesizedSubpattern; return; } @@ -3369,7 +3735,7 @@ class YarrGenerator : private DefaultMacroAssembler { // The ABI doesn't guarantee the upper bits are zero on unsigned arguments, so clear them ourselves. 
zeroExtend32ToPtr(index, index); zeroExtend32ToPtr(length, length); -#elif CPU(ARM) +#elif CPU(ARM_THUMB2) push(ARMRegisters::r4); push(ARMRegisters::r5); push(ARMRegisters::r6); @@ -3422,7 +3788,7 @@ class YarrGenerator : private DefaultMacroAssembler { #elif CPU(ARM64) if (m_decodeSurrogatePairs) popPair(framePointerRegister, linkRegister); -#elif CPU(ARM) +#elif CPU(ARM_THUMB2) pop(ARMRegisters::r8); pop(ARMRegisters::r6); pop(ARMRegisters::r5); @@ -3460,10 +3826,14 @@ public: } #endif -#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) - if (m_containsNestedSubpatterns) - codeBlock.setUsesPaternContextBuffer(); + if (m_pattern.m_containsBackreferences +#if ENABLE(YARR_JIT_BACKREFERENCES) + && (compileMode == MatchOnly || (m_pattern.ignoreCase() && m_charSize != Char8)) #endif + ) { + codeBlock.setFallBackWithFailureReason(JITFailureReason::BackReference); + return; + } // We need to compile before generating code since we set flags based on compilation that // are used during generation. @@ -3473,7 +3843,12 @@ public: codeBlock.setFallBackWithFailureReason(*m_failureReason); return; } - + +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + if (m_containsNestedSubpatterns) + codeBlock.setUsesPatternContextBuffer(); +#endif + generateEnter(); Jump hasInput = checkInput(); @@ -3618,7 +3993,10 @@ static void dumpCompileFailure(JITFailureReason failure) dataLog("Can't JIT a pattern decoding surrogate pairs\n"); break; case JITFailureReason::BackReference: - dataLog("Can't JIT a pattern containing back references\n"); + dataLog("Can't JIT some patterns containing back references\n"); + break; + case JITFailureReason::ForwardReference: + dataLog("Can't JIT a pattern containing forward references\n"); break; case JITFailureReason::VariableCountedParenthesisWithNonZeroMinimum: dataLog("Can't JIT a pattern containing a variable counted parenthesis with a non-zero minimum\n"); @@ -3626,8 +4004,8 @@ static void dumpCompileFailure(JITFailureReason failure) case 
JITFailureReason::ParenthesizedSubpattern: dataLog("Can't JIT a pattern containing parenthesized subpatterns\n"); break; - case JITFailureReason::NonGreedyParenthesizedSubpattern: - dataLog("Can't JIT a pattern containing non-greedy parenthesized subpatterns\n"); + case JITFailureReason::FixedCountParenthesizedSubpattern: + dataLog("Can't JIT a pattern containing fixed count parenthesized subpatterns\n"); break; case JITFailureReason::ExecutableMemoryAllocationFailure: dataLog("Can't JIT because of failure of allocation of executable memory\n"); diff --git a/src/3rdparty/masm/yarr/YarrJIT.h b/src/3rdparty/masm/yarr/YarrJIT.h index 35a0690f6e..c6410d3c44 100644 --- a/src/3rdparty/masm/yarr/YarrJIT.h +++ b/src/3rdparty/masm/yarr/YarrJIT.h @@ -54,9 +54,10 @@ namespace Yarr { enum class JITFailureReason : uint8_t { DecodeSurrogatePair, BackReference, + ForwardReference, VariableCountedParenthesisWithNonZeroMinimum, ParenthesizedSubpattern, - NonGreedyParenthesizedSubpattern, + FixedCountParenthesizedSubpattern, ExecutableMemoryAllocationFailure, }; @@ -107,7 +108,7 @@ public: #if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) bool usesPatternContextBuffer() { return m_usesPatternContextBuffer; } - void setUsesPaternContextBuffer() { m_usesPatternContextBuffer = true; } + void setUsesPatternContextBuffer() { m_usesPatternContextBuffer = true; } MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output, void* freeParenContext, unsigned parenContextSize) { diff --git a/src/3rdparty/masm/yarr/YarrParser.h b/src/3rdparty/masm/yarr/YarrParser.h index 3e5311f1fb..8032b39811 100644 --- a/src/3rdparty/masm/yarr/YarrParser.h +++ b/src/3rdparty/masm/yarr/YarrParser.h @@ -194,7 +194,9 @@ private: // invoked with inCharacterClass set. 
NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } - NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(String) { RELEASE_ASSERT_NOT_REACHED(); } + NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } + NO_RETURN_DUE_TO_ASSERT bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); return false; } + NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } private: Delegate& m_delegate; @@ -421,9 +423,16 @@ private: if (!atEndOfPattern() && !inCharacterClass) { if (consume() == '<') { auto groupName = tryConsumeGroupName(); - if (groupName && m_captureGroupNames.contains(groupName.value())) { - delegate.atomNamedBackReference(groupName.value()); - break; + if (groupName) { + if (m_captureGroupNames.contains(groupName.value())) { + delegate.atomNamedBackReference(groupName.value()); + break; + } + + if (delegate.isValidNamedForwardReference(groupName.value())) { + delegate.atomNamedForwardReference(groupName.value()); + break; + } } if (m_isUnicode) { m_errorCode = ErrorCode::InvalidBackreference; @@ -1132,11 +1141,13 @@ private: * void atomCharacterClassRange(UChar32 begin, UChar32 end) * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) * void atomCharacterClassEnd() - * void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> groupName); + * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName); * void atomParentheticalAssertionBegin(bool invert = false); * void atomParenthesesEnd(); * void atomBackReference(unsigned subpatternId); - * void atomNamedBackReference(String subpatternName); + * void atomNamedBackReference(const String& subpatternName); + * bool isValidNamedForwardReference(const String& subpatternName); + * void 
atomNamedForwardReference(const String& subpatternName); * * void quantifyAtom(unsigned min, unsigned max, bool greedy); * diff --git a/src/3rdparty/masm/yarr/YarrPattern.cpp b/src/3rdparty/masm/yarr/YarrPattern.cpp index ac66ea1b9a..9c1cdadf3f 100644 --- a/src/3rdparty/masm/yarr/YarrPattern.cpp +++ b/src/3rdparty/masm/yarr/YarrPattern.cpp @@ -33,12 +33,9 @@ #include "YarrParser.h" #include <wtf/DataLog.h> #include <wtf/Optional.h> -//#include <wtf/Threading.h> #include <wtf/Vector.h> #include <wtf/text/WTFString.h> -using namespace WTF; - namespace JSC { namespace Yarr { #include "RegExpJitTables.h" @@ -334,7 +331,7 @@ private: ranges.insert(i, CharacterRange(lo, hi)); return; } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning // If the new range start at or before the end of the last range, then the overlap (if it starts one after the // end of the last range they concatenate, which is just as good. 
if (lo <= (ranges[i].end + 1)) { @@ -446,9 +443,9 @@ public: { } - void reset() + void resetForReparsing() { - m_pattern.reset(); + m_pattern.resetForReparsing(); m_characterClassConstructor.reset(); auto body = std::make_unique<PatternDisjunction>(); @@ -456,7 +453,17 @@ public: m_alternative = body->addNewAlternative(); m_pattern.m_disjunctions.append(WTFMove(body)); } - + + void saveUnmatchedNamedForwardReferences() + { + m_unmatchedNamedForwardReferences.shrink(0); + + for (auto& entry : m_pattern.m_namedForwardReferences) { + if (!m_pattern.m_captureGroupNames.contains(entry)) + m_unmatchedNamedForwardReferences.append(entry); + } + } + void assertionBOL() { if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) { @@ -666,12 +673,24 @@ public: m_alternative->m_terms.append(PatternTerm(subpatternId)); } - void atomNamedBackReference(String subpatternName) + void atomNamedBackReference(const String& subpatternName) { ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end()); atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName)); } + bool isValidNamedForwardReference(const String& subpatternName) + { + return !m_unmatchedNamedForwardReferences.contains(subpatternName); + } + + void atomNamedForwardReference(const String& subpatternName) + { + if (!m_pattern.m_namedForwardReferences.contains(subpatternName)) + m_pattern.m_namedForwardReferences.append(subpatternName); + m_alternative->m_terms.append(PatternTerm::ForwardReference()); + } + // deep copy the argument disjunction. If filterStartsWithBOL is true, // skip alternatives with m_startsWithBOL set true. 
PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) @@ -1079,6 +1098,7 @@ private: YarrPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + Vector<String> m_unmatchedNamedForwardReferences; void* m_stackLimit; bool m_invertCharacterClass; bool m_invertParentheticalAssertion { false }; @@ -1101,13 +1121,14 @@ ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit) // Quoting Netscape's "What's new in JavaScript 1.2", // "Note: if the number of left parentheses is less than the number specified // in \#, the \# is taken as an octal escape as described in the next row." - if (containsIllegalBackReference()) { + if (containsIllegalBackReference() || containsIllegalNamedForwardReferences()) { if (unicode()) return ErrorCode::InvalidBackreference; unsigned numSubpatterns = m_numSubpatterns; - constructor.reset(); + constructor.saveUnmatchedNamedForwardReferences(); + constructor.resetForReparsing(); ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns); ASSERT_UNUSED(error, !hasError(error)); ASSERT(numSubpatterns == m_numSubpatterns); @@ -1168,7 +1189,7 @@ void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* else if (characterClass == pattern->wordcharCharacterClass()) out.print("<word>"); else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass()) - out.print("<unicode ignore case>"); + out.print("<unicode word ignore case>"); else if (characterClass == pattern->nondigitsCharacterClass()) out.print("<non-digits>"); else if (characterClass == pattern->nonspacesCharacterClass()) @@ -1176,7 +1197,7 @@ void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* else if (characterClass == pattern->nonwordcharCharacterClass()) out.print("<non-word>"); else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) - out.print("<unicode 
non-ignore case>"); + out.print("<unicode non-word ignore case>"); else { bool needMatchesRangesSeperator = false; @@ -1298,75 +1319,7 @@ void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nest break; case TypeCharacterClass: out.print("character class "); - if (characterClass->m_anyCharacter) - out.print("<any character>"); - else if (characterClass == thisPattern->newlineCharacterClass()) - out.print("<newline>"); - else if (characterClass == thisPattern->digitsCharacterClass()) - out.print("<digits>"); - else if (characterClass == thisPattern->spacesCharacterClass()) - out.print("<whitespace>"); - else if (characterClass == thisPattern->wordcharCharacterClass()) - out.print("<word>"); - else if (characterClass == thisPattern->wordUnicodeIgnoreCaseCharCharacterClass()) - out.print("<unicode ignore case>"); - else if (characterClass == thisPattern->nondigitsCharacterClass()) - out.print("<non-digits>"); - else if (characterClass == thisPattern->nonspacesCharacterClass()) - out.print("<non-whitespace>"); - else if (characterClass == thisPattern->nonwordcharCharacterClass()) - out.print("<non-word>"); - else if (characterClass == thisPattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) - out.print("<unicode non-ignore case>"); - else { - bool needMatchesRangesSeperator = false; - - auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) { - size_t matchesSize = matches.size(); - if (matchesSize) { - if (needMatchesRangesSeperator) - out.print(","); - needMatchesRangesSeperator = true; - - out.print(prefix, ":("); - for (size_t i = 0; i < matchesSize; ++i) { - if (i) - out.print(","); - dumpUChar32(out, matches[i]); - } - out.print(")"); - } - }; - - auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) { - size_t rangeSize = ranges.size(); - if (rangeSize) { - if (needMatchesRangesSeperator) - out.print(","); - needMatchesRangesSeperator = true; - - out.print(prefix, " ranges:("); - for (size_t i = 0; i < 
rangeSize; ++i) { - if (i) - out.print(","); - CharacterRange range = ranges[i]; - out.print("("); - dumpUChar32(out, range.begin); - out.print(".."); - dumpUChar32(out, range.end); - out.print(")"); - } - out.print(")"); - } - }; - - out.print("["); - dumpMatches("ASCII", characterClass->m_matches); - dumpRanges("ASCII", characterClass->m_ranges); - dumpMatches("Unicode", characterClass->m_matchesUnicode); - dumpRanges("Unicode", characterClass->m_rangesUnicode); - out.print("]"); - } + dumpCharacterClass(out, thisPattern, characterClass); dumpQuantifier(out); if (quantityType != QuantifierFixedCount || thisPattern->unicode()) out.print(",frame location ", frameLocation); @@ -1439,16 +1392,10 @@ void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsign } } -void YarrPattern::dumpPattern(const String& patternString) +void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString) { - dumpPattern(WTF::dataFile(), patternString); -} + out.print("/", patternString, "/"); -void YarrPattern::dumpPattern(PrintStream& out, const String& patternString) -{ - out.print("RegExp pattern for /"); - out.print(patternString); - out.print("/"); if (global()) out.print("g"); if (ignoreCase()) @@ -1459,6 +1406,18 @@ void YarrPattern::dumpPattern(PrintStream& out, const String& patternString) out.print("u"); if (sticky()) out.print("y"); +} + +void YarrPattern::dumpPattern(const String& patternString) +{ + dumpPattern(WTF::dataFile(), patternString); +} + +void YarrPattern::dumpPattern(PrintStream& out, const String& patternString) +{ + out.print("RegExp pattern for "); + dumpPatternString(out, patternString); + if (m_flags != NoFlags) { bool printSeperator = false; out.print(" ("); diff --git a/src/3rdparty/masm/yarr/YarrPattern.h b/src/3rdparty/masm/yarr/YarrPattern.h index 59decbac46..1417ff1549 100644 --- a/src/3rdparty/masm/yarr/YarrPattern.h +++ b/src/3rdparty/masm/yarr/YarrPattern.h @@ -354,7 +354,7 @@ struct TermChain { struct 
YarrPattern { JS_EXPORT_PRIVATE YarrPattern(const String& pattern, RegExpFlags, ErrorCode&, void* stackLimit = nullptr); - void reset() + void resetForReparsing() { m_numSubpatterns = 0; m_maxBackReference = 0; @@ -381,6 +381,7 @@ struct YarrPattern { m_disjunctions.clear(); m_userCharacterClasses.clear(); m_captureGroupNames.shrink(0); + m_namedForwardReferences.shrink(0); } bool containsIllegalBackReference() @@ -388,6 +389,19 @@ struct YarrPattern { return m_maxBackReference > m_numSubpatterns; } + bool containsIllegalNamedForwardReferences() + { + if (m_namedForwardReferences.isEmpty()) + return false; + + for (auto& entry : m_namedForwardReferences) { + if (!m_captureGroupNames.contains(entry)) + return true; + } + + return false; + } + bool containsUnsignedLengthPattern() { return m_containsUnsignedLengthPattern; @@ -489,6 +503,7 @@ struct YarrPattern { return unicodePropertiesCached.get(classID); } + void dumpPatternString(PrintStream& out, const String& patternString); void dumpPattern(const String& pattern); void dumpPattern(PrintStream& out, const String& pattern); @@ -512,6 +527,7 @@ struct YarrPattern { Vector<std::unique_ptr<PatternDisjunction>, 4> m_disjunctions; Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; Vector<String> m_captureGroupNames; + Vector<String> m_namedForwardReferences; HashMap<String, unsigned> m_namedGroupToParenIndex; private: @@ -554,8 +570,8 @@ private: uintptr_t begin; // Not really needed for greedy quantifiers. uintptr_t matchAmount; // Not really needed for fixed quantifiers. 
- unsigned beginIndex() { return offsetof(BackTrackInfoBackReference, begin) / sizeof(uintptr_t); } - unsigned matchAmountIndex() { return offsetof(BackTrackInfoBackReference, matchAmount) / sizeof(uintptr_t); } + static unsigned beginIndex() { return offsetof(BackTrackInfoBackReference, begin) / sizeof(uintptr_t); } + static unsigned matchAmountIndex() { return offsetof(BackTrackInfoBackReference, matchAmount) / sizeof(uintptr_t); } }; struct BackTrackInfoAlternative { diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp index 9f05f22852..358cc94d6b 100644 --- a/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp +++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp @@ -48,7 +48,9 @@ public: void atomParentheticalAssertionBegin(bool = false) {} void atomParenthesesEnd() {} void atomBackReference(unsigned) {} - void atomNamedBackReference(String) {} + void atomNamedBackReference(const String&) {} + bool isValidNamedForwardReference(const String&) { return true; } + void atomNamedForwardReference(const String&) {} void quantifyAtom(unsigned, unsigned, bool) {} void disjunction() {} }; diff --git a/src/3rdparty/masm/yarr/create_regex_tables b/src/3rdparty/masm/yarr/create_regex_tables index 4c3dbbe3fb..992566db77 100644 --- a/src/3rdparty/masm/yarr/create_regex_tables +++ b/src/3rdparty/masm/yarr/create_regex_tables @@ -32,7 +32,7 @@ types = { "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]}, "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordUnicodeIgnoreCaseChar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x017e), (0x0180, 0x2129), (0x212b, 0x10ffff)]}, "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]}, - "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 
0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]}, + "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]}, "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0x10ffff)]}, "digits": { "UseTable" : False, "data": [('0', '9')]}, "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0x10ffff)] } diff --git a/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode b/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode index a103bcdf16..95549c7eb5 100644 --- a/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode +++ b/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode @@ -31,7 +31,6 @@ import optparse import os import re import sys -from sets import Set header = """/* * Copyright (C) 2016 Apple Inc. All rights reserved. 
@@ -78,9 +77,12 @@ def openOrExit(path, mode): dirname = os.path.dirname(path) if not os.path.isdir(dirname): os.makedirs(dirname) - return open(path, mode) + if sys.version_info.major >= 3: + return open(path, mode, encoding="UTF-8") + else: + return open(path, mode) except IOError as e: - print "I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror) + print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror)) exit(1) class Canonicalize: @@ -93,7 +95,7 @@ class Canonicalize: self.canonicalGroups[mapping].append(code) def readCaseFolding(self, file): - codesSeen = Set() + codesSeen = set() for line in file: line = line.split('#', 1)[0] line = line.rstrip() @@ -154,8 +156,8 @@ class Canonicalize: for i in range(len(characterSets)): characters = "" - set = characterSets[i] - for ch in set: + cur_set = characterSets[i] + for ch in cur_set: characters = characters + "0x{character:04x}, ".format(character=ch) file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=i, characters=characters)) @@ -189,7 +191,7 @@ if __name__ == "__main__": caseFoldingTxtPath = args[0] canonicalizeHPath = args[1] caseFoldingTxtFile = openOrExit(caseFoldingTxtPath, "r") - canonicalizeHFile = openOrExit(canonicalizeHPath, "wb") + canonicalizeHFile = openOrExit(canonicalizeHPath, "w") canonicalize = Canonicalize() canonicalize.readCaseFolding(caseFoldingTxtFile) |