diff options
author | Lars Knoll <lars.knoll@qt.io> | 2018-08-06 14:55:21 +0200 |
---|---|---|
committer | Lars Knoll <lars.knoll@qt.io> | 2018-08-10 14:16:09 +0000 |
commit | 18d2f78437d28987297148b63b99ceed6313a78a (patch) | |
tree | 845e016b002a123e394df43fcf88cf2dc7ee1fb6 | |
parent | 577630fe4a1f6a129239788080ff9e802118fd26 (diff) |
Update Yarr to the latest version from WebKit
Updated Yarr to commit
4d2a53d60487cb1f8b2a9a1e9f684af336fd7d2c in WebKit.
Adjusted the yarr code base to work with our older version of
wtf and masm.
Change-Id: I04b4593ece051e1d7aa087b87aa08c92595d1098
Reviewed-by: Simon Hausmann <simon.hausmann@qt.io>
59 files changed, 6163 insertions, 1694 deletions
diff --git a/src/3rdparty/masm/assembler/ARM64Assembler.h b/src/3rdparty/masm/assembler/ARM64Assembler.h index 1787e921e8..fcf2e485e8 100644 --- a/src/3rdparty/masm/assembler/ARM64Assembler.h +++ b/src/3rdparty/masm/assembler/ARM64Assembler.h @@ -1980,6 +1980,13 @@ public: } template<int datasize> + ALWAYS_INLINE void stp(RegisterID rt, RegisterID rt2, RegisterID rn, unsigned pimm = 0) + { + CHECK_DATASIZE(); + insn(loadStoreRegisterPairOffset(MEMPAIROPSIZE_INT(datasize), false, MemOp_STORE, pimm, rn, rt, rt2)); + } + + template<int datasize> ALWAYS_INLINE void str(RegisterID rt, RegisterID rn, RegisterID rm) { str<datasize>(rt, rn, rm, UXTX, 0); @@ -3701,6 +3708,23 @@ private: } // 'V' means vector + ALWAYS_INLINE static int loadStoreRegisterPairOffset(MemPairOpSize size, bool V, MemOp opc, int immediate, RegisterID rn, FPRegisterID rt, FPRegisterID rt2) + { + ASSERT(size < 3); + ASSERT(opc == (opc & 1)); // Only load or store, load signed 64 is handled via size. + ASSERT(V || (size != MemPairOp_LoadSigned_32) || (opc == MemOp_LOAD)); // There isn't an integer store signed. 
+ unsigned immedShiftAmount = memPairOffsetShift(V, size); + int imm7 = immediate >> immedShiftAmount; + ASSERT((imm7 << immedShiftAmount) == immediate && isInt7(imm7)); + return (0x29000000 | size << 30 | V << 26 | opc << 22 | (imm7 & 0x7f) << 15 | rt2 << 10 | xOrSp(rn) << 5 | rt); + } + + ALWAYS_INLINE static int loadStoreRegisterPairOffset(MemPairOpSize size, bool V, MemOp opc, int immediate, RegisterID rn, RegisterID rt, RegisterID rt2) + { + return loadStoreRegisterPairOffset(size, V, opc, immediate, rn, xOrZrAsFPR(rt), xOrZrAsFPR(rt2)); + } + + // 'V' means vector // 'S' means shift rm ALWAYS_INLINE static int loadStoreRegisterRegisterOffset(MemOpSize size, bool V, MemOp opc, RegisterID rm, ExtendType option, bool S, RegisterID rn, FPRegisterID rt) { diff --git a/src/3rdparty/masm/assembler/LinkBuffer.h b/src/3rdparty/masm/assembler/LinkBuffer.h index 75b9a5c0bd..8af084c330 100644 --- a/src/3rdparty/masm/assembler/LinkBuffer.h +++ b/src/3rdparty/masm/assembler/LinkBuffer.h @@ -26,6 +26,8 @@ #ifndef LinkBuffer_h #define LinkBuffer_h +#include <Platform.h> + #if ENABLE(ASSEMBLER) #define DUMP_LINK_STATISTICS 0 @@ -66,7 +68,7 @@ struct DefaultExecutableOffsetCalculator { // template <typename MacroAssembler, template <typename T> class ExecutableOffsetCalculator> class LinkBufferBase { - WTF_MAKE_NONCOPYABLE(LinkBufferBase); + WTF_MAKE_NONCOPYABLE(LinkBufferBase) typedef MacroAssemblerCodeRef CodeRef; typedef MacroAssemblerCodePtr CodePtr; typedef typename MacroAssembler::Label Label; @@ -265,7 +267,7 @@ protected: #define FINALIZE_CODE_IF(condition, linkBufferReference, dataLogFArgumentsForHeading) \ (UNLIKELY((condition)) \ - ? ((linkBufferReference).finalizeCodeWithDisassembly dataLogFArgumentsForHeading) \ + ? 
((linkBufferReference).finalizeCodeWithDisassembly (dataLogFArgumentsForHeading)) \ : (linkBufferReference).finalizeCodeWithoutDisassembly()) // Use this to finalize code, like so: @@ -518,6 +520,20 @@ public: #endif +#if CPU(ARM_THUMB2) +typedef LinkBuffer<MacroAssembler<MacroAssemblerARMv7>> DefaultLinkBuffer; +#elif CPU(ARM64) +typedef LinkBuffer<MacroAssembler<MacroAssemblerARM64>> DefaultLinkBuffer; +#elif CPU(ARM_TRADITIONAL) +typedef LinkBuffer<MacroAssembler<MacroAssemblerARM>> DefaultLinkBuffer; +#elif CPU(MIPS) +typedef LinkBuffer<MacroAssembler<MacroAssemblerMIPS>> DefaultLinkBuffer; +#elif CPU(X86) +typedef LinkBuffer<MacroAssembler<MacroAssemblerX86>> DefaultLinkBuffer; +#elif CPU(X86_64) +typedef LinkBuffer<MacroAssembler<MacroAssemblerX86_64>> DefaultLinkBuffer; +#endif + } // namespace JSC #endif // ENABLE(ASSEMBLER) diff --git a/src/3rdparty/masm/assembler/MacroAssembler.h b/src/3rdparty/masm/assembler/MacroAssembler.h index 20ddcadae1..77aec128e5 100644 --- a/src/3rdparty/masm/assembler/MacroAssembler.h +++ b/src/3rdparty/masm/assembler/MacroAssembler.h @@ -805,7 +805,7 @@ public: Jump branchPtr(RelationalCondition cond, RegisterID left, RegisterID right) { - return branch64(cond, left, right); + return this->branch64(cond, left, right); } Jump branchPtr(RelationalCondition cond, RegisterID left, TrustedImmPtr right) @@ -840,7 +840,7 @@ public: Jump branchTestPtr(ResultCondition cond, RegisterID reg, TrustedImm32 mask = TrustedImm32(-1)) { - return branchTest64(cond, reg, mask); + return this->branchTest64(cond, reg, mask); } Jump branchTestPtr(ResultCondition cond, Address address, TrustedImm32 mask = TrustedImm32(-1)) diff --git a/src/3rdparty/masm/assembler/MacroAssemblerARM64.h b/src/3rdparty/masm/assembler/MacroAssemblerARM64.h index ba0d7e93f8..e5a704292d 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerARM64.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerARM64.h @@ -1126,6 +1126,11 @@ public: m_assembler.ldrh(dest, address.base, 
memoryTempRegister); } + void load16Unaligned(ImplicitAddress address, RegisterID dest) + { + load16(address, dest); + } + void load16Unaligned(BaseIndex address, RegisterID dest) { load16(address, dest); @@ -1283,6 +1288,16 @@ public: return label; } + void storePair64(RegisterID src1, RegisterID src2, RegisterID dest) + { + storePair64(src1, src2, dest, TrustedImm32(0)); + } + + void storePair64(RegisterID src1, RegisterID src2, RegisterID dest, TrustedImm32 offset) + { + m_assembler.stp<64>(src1, src2, dest, offset.m_value); + } + void store32(RegisterID src, ImplicitAddress address) { if (tryStoreWithOffset<32>(src, address.base, address.offset)) @@ -1420,6 +1435,14 @@ public: store8(dataTempRegister, address); } + void getEffectiveAddress(BaseIndex address, RegisterID dest) + { + m_assembler.add<64>(dest, address.base, address.index, ARM64Assembler::LSL, address.scale); + if (address.offset) + add64(TrustedImm32(address.offset), dest); + } + + // Floating-point operations: static bool supportsFloatingPoint() { return true; } diff --git a/src/3rdparty/masm/assembler/MacroAssemblerARMv7.h b/src/3rdparty/masm/assembler/MacroAssemblerARMv7.h index d91122d4a1..99801a0e3b 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerARMv7.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerARMv7.h @@ -255,6 +255,14 @@ public: store32(dataTempRegister, address.m_ptr); } + void getEffectiveAddress(BaseIndex address, RegisterID dest) + { + m_assembler.lsl(addressTempRegister, address.index, static_cast<int>(address.scale)); + m_assembler.add(dest, address.base, addressTempRegister); + if (address.offset) + add32(TrustedImm32(address.offset), dest); + } + void add64(TrustedImm32 imm, AbsoluteAddress address) { move(TrustedImmPtr(address.m_ptr), addressTempRegister); @@ -680,6 +688,11 @@ public: load32(setupArmAddress(address), dest); } + void load16Unaligned(ImplicitAddress address, RegisterID dest) + { + load16(setupArmAddress(address), dest); + } + void 
load16Unaligned(BaseIndex address, RegisterID dest) { load16(setupArmAddress(address), dest); diff --git a/src/3rdparty/masm/assembler/MacroAssemblerCodeRef.h b/src/3rdparty/masm/assembler/MacroAssemblerCodeRef.h index e3c77d99e6..a7e78ad78f 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerCodeRef.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerCodeRef.h @@ -27,6 +27,7 @@ #define MacroAssemblerCodeRef_h #include "Disassembler.h" +#include <wtf/Platform.h> #include "ExecutableAllocator.h" #include "LLIntData.h" #include <wtf/DataLog.h> @@ -141,6 +142,8 @@ public: ASSERT_VALID_CODE_POINTER(m_value); } + inline FunctionPtr(MacroAssemblerCodePtr ptr); + // MSVC doesn't seem to treat functions with different calling conventions as // different types; these methods already defined for fastcall, below. #if CALLING_CONVENTION_IS_STDCALL && !OS(WINDOWS) @@ -327,6 +330,12 @@ private: void* m_value; }; + +FunctionPtr::FunctionPtr(MacroAssemblerCodePtr ptr) + : m_value(ptr.executableAddress()) +{ +} + // MacroAssemblerCodeRef: // // A reference to a section of JIT generated code. 
A CodeRef consists of a diff --git a/src/3rdparty/masm/assembler/MacroAssemblerMIPS.h b/src/3rdparty/masm/assembler/MacroAssemblerMIPS.h index f2ad6a4470..07f0ec623f 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerMIPS.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerMIPS.h @@ -27,6 +27,8 @@ #ifndef MacroAssemblerMIPS_h #define MacroAssemblerMIPS_h +#include <Platform.h> + #if ENABLE(ASSEMBLER) && CPU(MIPS) #include "AbstractMacroAssembler.h" @@ -268,6 +270,18 @@ public: m_assembler.sw(dataTempRegister, addrTempRegister, 4); } + void getEffectiveAddress(BaseIndex address, RegisterID dest) + { + if (!address.scale && !m_fixedWidth) + m_assembler.addu(dest, address.index, address.base); + else { + m_assembler.sll(addrTempRegister, address.index, address.scale); + m_assembler.addu(dest, addrTempRegister, address.base); + } + if (address.offset) + add32(TrustedImm32(address.offset), dest); + } + void and32(Address src, RegisterID dest) { load32(src, dataTempRegister); diff --git a/src/3rdparty/masm/assembler/MacroAssemblerX86.h b/src/3rdparty/masm/assembler/MacroAssemblerX86.h index 280cf427fc..e3e0bfe5e1 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerX86.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerX86.h @@ -108,6 +108,11 @@ public: m_assembler.adcl_im(imm.m_value >> 31, reinterpret_cast<const char*>(address.m_ptr) + sizeof(int32_t)); } + void getEffectiveAddress(BaseIndex address, RegisterID dest) + { + return x86Lea32(address, dest); + } + void and32(TrustedImm32 imm, AbsoluteAddress address) { m_assembler.andl_im(imm.m_value, address.m_ptr); diff --git a/src/3rdparty/masm/assembler/MacroAssemblerX86Common.h b/src/3rdparty/masm/assembler/MacroAssemblerX86Common.h index 94771be6a7..769b4346ee 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerX86Common.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerX86Common.h @@ -146,14 +146,24 @@ public: m_assembler.andl_rr(src, dest); } - void add32(RegisterID op1, RegisterID op2, RegisterID dest) + 
void add32(RegisterID a, RegisterID b, RegisterID dest) { - if (op2 == dest) { - add32(op1, dest); - } else { - move(op1, dest); - add32(op2, dest); + x86Lea32(BaseIndex(a, b, TimesOne), dest); + } + + void x86Lea32(BaseIndex index, RegisterID dest) + { + if (!index.scale && !index.offset) { + if (index.base == dest) { + add32(index.index, dest); + return; + } + if (index.index == dest) { + add32(index.base, dest); + return; + } } + m_assembler.leal_mr(index.offset, index.base, index.index, index.scale, dest); } void and32(TrustedImm32 imm, RegisterID dest) @@ -501,6 +511,11 @@ public: load32(address, dest); } + void load16Unaligned(ImplicitAddress address, RegisterID dest) + { + load16(address, dest); + } + void load16Unaligned(BaseIndex address, RegisterID dest) { load16(address, dest); @@ -558,6 +573,11 @@ public: m_assembler.movzwl_mr(address.offset, address.base, address.index, address.scale, dest); } + void load16(ImplicitAddress address, RegisterID dest) + { + m_assembler.movzwl_mr(address.offset, address.base, dest); + } + void load16(Address address, RegisterID dest) { m_assembler.movzwl_mr(address.offset, address.base, dest); diff --git a/src/3rdparty/masm/assembler/MacroAssemblerX86_64.h b/src/3rdparty/masm/assembler/MacroAssemblerX86_64.h index 002caaae78..f4349e1f93 100644 --- a/src/3rdparty/masm/assembler/MacroAssemblerX86_64.h +++ b/src/3rdparty/masm/assembler/MacroAssemblerX86_64.h @@ -243,6 +243,26 @@ public: add64(imm, Address(scratchRegister)); } + void x86Lea64(BaseIndex index, RegisterID dest) + { + if (!index.scale && !index.offset) { + if (index.base == dest) { + add64(index.index, dest); + return; + } + if (index.index == dest) { + add64(index.base, dest); + return; + } + } + m_assembler.leaq_mr(index.offset, index.base, index.index, index.scale, dest); + } + + void getEffectiveAddress(BaseIndex address, RegisterID dest) + { + return x86Lea64(address, dest); + } + void and64(RegisterID src, RegisterID dest) { m_assembler.andq_rr(src, dest); 
diff --git a/src/3rdparty/masm/assembler/X86Assembler.h b/src/3rdparty/masm/assembler/X86Assembler.h index 6fa66e0dd7..2257cb2b9a 100644 --- a/src/3rdparty/masm/assembler/X86Assembler.h +++ b/src/3rdparty/masm/assembler/X86Assembler.h @@ -26,6 +26,8 @@ #ifndef X86Assembler_h #define X86Assembler_h +#include <Platform.h> + #if ENABLE(ASSEMBLER) && (CPU(X86) || CPU(X86_64)) #include "AssemblerBuffer.h" @@ -1417,11 +1419,22 @@ public: { m_formatter.oneByteOp(OP_LEA, dst, base, offset); } + + void leal_mr(int offset, RegisterID base, RegisterID index, int scale, RegisterID dst) + { + m_formatter.oneByteOp(OP_LEA, dst, base, index, scale, offset); + } + #if CPU(X86_64) void leaq_mr(int offset, RegisterID base, RegisterID dst) { m_formatter.oneByteOp64(OP_LEA, dst, base, offset); } + + void leaq_mr(int offset, RegisterID base, RegisterID index, int scale, RegisterID dst) + { + m_formatter.oneByteOp64(OP_LEA, dst, base, index, scale, offset); + } #endif // Flow control: diff --git a/src/3rdparty/masm/masm-defs.pri b/src/3rdparty/masm/masm-defs.pri index 34d6a67451..08c46a7ac2 100644 --- a/src/3rdparty/masm/masm-defs.pri +++ b/src/3rdparty/masm/masm-defs.pri @@ -20,6 +20,7 @@ INCLUDEPATH += $$PWD/assembler INCLUDEPATH += $$PWD/runtime INCLUDEPATH += $$PWD/wtf INCLUDEPATH += $$PWD/stubs +INCLUDEPATH += $$PWD/stubs/runtime INCLUDEPATH += $$PWD/stubs/wtf INCLUDEPATH += $$PWD diff --git a/src/3rdparty/masm/masm.pri b/src/3rdparty/masm/masm.pri index f7cdae7421..0e63ac2ce5 100644 --- a/src/3rdparty/masm/masm.pri +++ b/src/3rdparty/masm/masm.pri @@ -79,10 +79,12 @@ HEADERS += $$PWD/disassembler/ARM64/A64DOpcode.h !qmldevtools_build { SOURCES += $$PWD/yarr/YarrCanonicalizeUCS2.cpp \ + $$PWD/yarr/YarrCanonicalizeUnicode.cpp \ $$PWD/yarr/YarrInterpreter.cpp \ $$PWD/yarr/YarrJIT.cpp \ $$PWD/yarr/YarrPattern.cpp \ - $$PWD/yarr/YarrSyntaxChecker.cpp + $$PWD/yarr/YarrSyntaxChecker.cpp \ + $$PWD/stubs/yarr/YarrUnicodeProperties.cpp HEADERS += $$PWD/yarr/Yarr.h \ 
$$PWD/yarr/YarrCanonicalizeUCS2.h \ @@ -90,7 +92,8 @@ HEADERS += $$PWD/yarr/Yarr.h \ $$PWD/yarr/YarrJIT.h \ $$PWD/yarr/YarrParser.h \ $$PWD/yarr/YarrPattern.h \ - $$PWD/yarr/YarrSyntaxChecker.h + $$PWD/yarr/YarrSyntaxChecker.h \ + $$PWD/yarr/YarrUnicodeProperties.h } # @@ -107,7 +110,7 @@ debug_and_release { INCLUDEPATH += $$GENERATEDDIR retgen.output = $$GENERATEDDIR/RegExpJitTables.h -retgen.script = $$PWD/create_regex_tables +retgen.script = $$PWD/yarr/create_regex_tables retgen.input = retgen.script retgen.CONFIG += no_link retgen.commands = python $$retgen.script > ${QMAKE_FILE_OUT} diff --git a/src/3rdparty/masm/stubs/ExecutableAllocator.h b/src/3rdparty/masm/stubs/ExecutableAllocator.h index 1ab28588fb..471fe3c952 100644 --- a/src/3rdparty/masm/stubs/ExecutableAllocator.h +++ b/src/3rdparty/masm/stubs/ExecutableAllocator.h @@ -82,7 +82,7 @@ struct ExecutableMemoryHandle : public RefCounted<ExecutableMemoryHandle> { inline bool isManaged() const { return true; } - void* start() { return m_allocation->start(); } + void *start() { return m_allocation->start(); } size_t sizeInBytes() { return m_size; } QV4::ExecutableAllocator::ChunkOfPages *chunk() const @@ -98,7 +98,7 @@ struct ExecutableAllocator { : realAllocator(alloc) {} - PassRefPtr<ExecutableMemoryHandle> allocate(JSGlobalData&, size_t size, void*, int) + Ref<ExecutableMemoryHandle> allocate(JSGlobalData&, size_t size, void*, int) { return adoptRef(new ExecutableMemoryHandle(realAllocator, size)); } diff --git a/src/3rdparty/masm/stubs/Options.h b/src/3rdparty/masm/stubs/Options.h index e03cc67690..6339c06033 100644 --- a/src/3rdparty/masm/stubs/Options.h +++ b/src/3rdparty/masm/stubs/Options.h @@ -44,6 +44,8 @@ namespace JSC { struct Options { static bool showDisassembly(); static bool showDFGDisassembly() { return true; } + static bool zeroStackFrame() { return true; } + static bool dumpCompiledRegExpPatterns() { return false; } }; } diff --git a/src/3rdparty/masm/stubs/SuperSampler.h 
b/src/3rdparty/masm/stubs/SuperSampler.h new file mode 100644 index 0000000000..422de528e1 --- /dev/null +++ b/src/3rdparty/masm/stubs/SuperSampler.h @@ -0,0 +1,50 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. 
Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#pragma once + +namespace WTF { + +struct SuperSamplerScope { + SuperSamplerScope(bool) {} +}; + +} + +using WTF::SuperSamplerScope; diff --git a/src/3rdparty/masm/stubs/runtime/ConcurrentJSLock.h b/src/3rdparty/masm/stubs/runtime/ConcurrentJSLock.h new file mode 100644 index 0000000000..43868feadb --- /dev/null +++ b/src/3rdparty/masm/stubs/runtime/ConcurrentJSLock.h @@ -0,0 +1,53 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. 
+** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#pragma once + +namespace JSC { + +class NoLock { +public: + void lock() { } + void unlock() { } + bool isHeld() { return false; } +}; + +typedef NoLock ConcurrentJSLock; + +} // namespace JSC diff --git a/src/3rdparty/masm/stubs/runtime/RegExpKey.h b/src/3rdparty/masm/stubs/runtime/RegExpKey.h new file mode 100644 index 0000000000..392f66fb83 --- /dev/null +++ b/src/3rdparty/masm/stubs/runtime/RegExpKey.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2010 University of Szeged + * Copyright (C) 2010 Renata Hodovan (hodovan@inf.u-szeged.hu) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <wtf/text/WTFString.h> + +namespace JSC { + +enum RegExpFlags : int8_t { + NoFlags = 0, + FlagGlobal = 1, + FlagIgnoreCase = 2, + FlagMultiline = 4, + FlagSticky = 8, + FlagUnicode = 16, + FlagDotAll = 32, + InvalidFlags = 64, + DeletedValueFlags = -1 +}; + +} // namespace JSC diff --git a/src/3rdparty/masm/stubs/runtime/VM.h b/src/3rdparty/masm/stubs/runtime/VM.h new file mode 100644 index 0000000000..94cce814f3 --- /dev/null +++ b/src/3rdparty/masm/stubs/runtime/VM.h @@ -0,0 +1,50 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. 
For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ +#ifndef MASM_VM_H +#define MASM_VM_H + +#include <qv4engine_p.h> + +namespace JSC { + +class VM : public QV4::ExecutionEngine {}; + +} + +#endif // MASM_VM_H diff --git a/src/3rdparty/masm/stubs/wtf/HashMap.h b/src/3rdparty/masm/stubs/wtf/HashMap.h new file mode 100644 index 0000000000..888c6cceb0 --- /dev/null +++ b/src/3rdparty/masm/stubs/wtf/HashMap.h @@ -0,0 +1,58 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. 
+** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. 
+** +** $QT_END_LICENSE$ +** +****************************************************************************/ +#ifndef HASHMAP_H +#define HASHMAP_H + +#include <QtCore/qhash.h> + +namespace WTF { + +template<typename Key, typename Value> +class HashMap final : public QHash<Key, Value> +{ +public: + void add(const Key &k, const Value &v) { QHash<Key, Value>::insert(k, v); } + Value get(const Key &k) { return QHash<Key, Value>::value(k); } +}; + +} + +using WTF::HashMap; + +#endif diff --git a/src/3rdparty/masm/stubs/wtf/HashSet.h b/src/3rdparty/masm/stubs/wtf/HashSet.h new file mode 100644 index 0000000000..3765c9a8b1 --- /dev/null +++ b/src/3rdparty/masm/stubs/wtf/HashSet.h @@ -0,0 +1,67 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. 
+** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ +#ifndef HASHSET_H +#define HASHSET_H + +#include <QtCore/qset.h> + +namespace WTF { + +template<typename Key> +class HashSet final : public QSet<Key> +{ +public: + struct SetAddResult { + bool isNewEntry; + }; + SetAddResult add(const Key &k) + { + if (QSet<Key>::find(k) == QSet<Key>::constEnd()) { + QSet<Key>::insert(k); + return { true }; + } + return { false }; + } +}; + +} + +using WTF::HashSet; + +#endif diff --git a/src/3rdparty/masm/stubs/wtf/Optional.h b/src/3rdparty/masm/stubs/wtf/Optional.h new file mode 100644 index 0000000000..44fa3ee62d --- /dev/null +++ b/src/3rdparty/masm/stubs/wtf/Optional.h @@ -0,0 +1,83 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. 
For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. 
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#pragma once
+
+#include <QtCore/qglobal.h>
+
+#if __cplusplus > 201402L && QT_HAS_INCLUDE(<optional>)
+#include <optional>
+#else
+
+// NOTE(review): declaring names in namespace std is formally undefined behaviour;
+// it is kept because the Yarr sources spell the type std::optional.
+namespace std {
+
+// Tag type used to construct or assign an empty optional.
+struct nullopt_t {};
+
+constexpr nullopt_t nullopt {};
+
+// Minimal fallback for std::optional, used only when the real <optional> is not
+// available (see the #if above). Unlike the standard type it always stores a T:
+// an empty optional holds a value-initialized T, so T must be default-constructible
+// and copy-assignable. No access is checked against has_value().
+template<typename T>
+class optional {
+public:
+    optional() = default;
+    optional(nullopt_t) {}
+    optional(const T &v) : _value(v), _hasValue(true) {}
+    ~optional() = default;
+
+    // Resets to the empty state; the stored value is overwritten with T().
+    optional &operator =(nullopt_t) {
+        _value = T();
+        _hasValue = false;
+        return *this;
+    }
+
+    // Member access and dereference follow the std::optional contract: they refer to
+    // the stored object itself. The previous by-value returns made `opt->member`
+    // unusable for non-pointer T and turned `*opt = x` into a write to a temporary.
+    T *operator->() { return &_value; }
+    const T *operator->() const { return &_value; }
+    T &operator*() { return _value; }
+    const T &operator*() const { return _value; }
+
+    // NOTE(review): std::optional declares this conversion `explicit`; it is left
+    // implicit here so existing callers keep compiling - consider tightening.
+    operator bool() const { return _hasValue; }
+    bool has_value() const { return _hasValue; }
+
+    // Returns a copy of the stored value; when empty this is the value-initialized T.
+    T value() const { return _value; }
+
+private:
+    T _value = T();
+    bool _hasValue = false;
+};
+
+}
+
+#endif
diff --git a/src/3rdparty/masm/stubs/wtf/PassRefPtr.h b/src/3rdparty/masm/stubs/wtf/PassRefPtr.h index f072e70dd7..cc03a5d651 100644 --- a/src/3rdparty/masm/stubs/wtf/PassRefPtr.h +++ b/src/3rdparty/masm/stubs/wtf/PassRefPtr.h @@ -83,14 +83,22 @@ public: private: PassRefPtr<T>& operator=(const PassRefPtr<T>& t); - template <typename PtrType> friend PassRefPtr<PtrType> adoptRef(PtrType*); +protected: mutable T* m_ptr; }; template <typename T> -PassRefPtr<T> adoptRef(T* ptr) +class Ref : public PassRefPtr<T> { - PassRefPtr<T> result; + using PassRefPtr<T>::PassRefPtr; + + template <typename PtrType> friend Ref<PtrType> adoptRef(PtrType*); +}; + +template <typename T> +Ref<T> adoptRef(T* ptr) +{ + Ref<T> result; result.m_ptr = ptr; return result; } diff --git a/src/3rdparty/masm/stubs/wtf/Vector.h b/src/3rdparty/masm/stubs/wtf/Vector.h index 2025acf8a9..f4f4dc5cf4 100644 --- a/src/3rdparty/masm/stubs/wtf/Vector.h +++ b/src/3rdparty/masm/stubs/wtf/Vector.h @@ -55,6 +55,8 @@ class Vector : public std::vector<T> { public: Vector() {} Vector(int initialSize) : std::vector<T>(initialSize) {} + Vector(const Vector
&other) : std::vector<T>(other) {} + Vector(std::initializer_list<T> list) : std::vector<T>(list) {} inline void append(const T& value) { this->push_back(value); } @@ -63,6 +65,9 @@ public: inline void append(const OtherType& other) { this->push_back(T(other)); } + inline void append(T&& other) + { this->push_back(std::move(other)); } + inline void append(const Vector<T>& vector) { this->insert(this->end(), vector.begin(), vector.end()); @@ -80,6 +85,8 @@ public: this->push_back(*it); } + unsigned size() const { return static_cast<unsigned>(std::vector<T>::size()); } + using std::vector<T>::insert; inline void reserveInitialCapacity(size_t size) { this->reserve(size); } diff --git a/src/3rdparty/masm/stubs/wtf/text/CString.h b/src/3rdparty/masm/stubs/wtf/text/CString.h index 26f74f7593..7129f5049e 100644 --- a/src/3rdparty/masm/stubs/wtf/text/CString.h +++ b/src/3rdparty/masm/stubs/wtf/text/CString.h @@ -39,4 +39,8 @@ #ifndef CSTRING_H #define CSTRING_H +class CString : public QByteArray { + +}; + #endif // CSTRING_H diff --git a/src/3rdparty/masm/stubs/wtf/text/StringBuilder.h b/src/3rdparty/masm/stubs/wtf/text/StringBuilder.h new file mode 100644 index 0000000000..a382f6da83 --- /dev/null +++ b/src/3rdparty/masm/stubs/wtf/text/StringBuilder.h @@ -0,0 +1,52 @@ +/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. 
For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. 
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#pragma once
+
+#include <wtf/text/WTFString.h>
+
+namespace WTF {
+
+// Stand-in for WTF::StringBuilder: it is simply a String, so the only member
+// declared here is toString(); everything else is inherited from String.
+struct StringBuilder : public String
+{
+    // Returns a copy of the String base of this object.
+    String toString() const { return *this; }
+};
+
+}
+
+using WTF::StringBuilder;
diff --git a/src/3rdparty/masm/stubs/wtf/text/WTFString.h b/src/3rdparty/masm/stubs/wtf/text/WTFString.h index 928c684fdb..da5183f734 100644 --- a/src/3rdparty/masm/stubs/wtf/text/WTFString.h +++ b/src/3rdparty/masm/stubs/wtf/text/WTFString.h @@ -42,26 +42,33 @@ #include <QString> #include <wtf/ASCIICType.h> #include <wtf/unicode/Unicode.h> +#include <memory> namespace WTF { +class PrintStream; + class String : public QString { public: + String() = default; String(const QString& s) : QString(s) {} bool is8Bit() const { return false; } const unsigned char *characters8() const { return 0; } const UChar *characters16() const { return reinterpret_cast<const UChar*>(constData()); } template <typename T> - const T* getCharacters() const; + const T* characters() const; + + bool operator!() const { return isEmpty(); } + void dump(PrintStream &) const {} }; template <> -inline const unsigned char* String::getCharacters<unsigned char>() const { return characters8(); } +inline const unsigned char* String::characters<unsigned char>() const { return characters8(); } template <> -inline const UChar* String::getCharacters<UChar>() const { return characters16(); } +inline const UChar* String::characters<UChar>() const { return characters16(); } } @@ -70,4 +77,6 @@ namespace JSC { using WTF::String; } +#define WTFMove(value) std::move(value) + #endif // WTFSTRING_H diff --git a/src/3rdparty/masm/stubs/wtf/unicode/Unicode.h b/src/3rdparty/masm/stubs/wtf/unicode/Unicode.h index d61cec5c4e..0f7f005c89 100644 --- a/src/3rdparty/masm/stubs/wtf/unicode/Unicode.h +++ b/src/3rdparty/masm/stubs/wtf/unicode/Unicode.h @@ -43,6 +43,7 @@ typedef unsigned char LChar; typedef unsigned short UChar;
+typedef int32_t UChar32; namespace Unicode { inline UChar toLower(UChar ch) { @@ -52,6 +53,35 @@ namespace Unicode { inline UChar toUpper(UChar ch) { return QChar::toUpper(ch); } + inline UChar32 u_tolower(UChar32 ch) { + return QChar::toLower(ch); + } + inline UChar32 u_toupper(UChar32 ch) { + return QChar::toUpper(ch); + } } +using Unicode::u_toupper; +using Unicode::u_tolower; + +#define U16_IS_LEAD(ch) QChar::isHighSurrogate((ch)) +#define U16_IS_TRAIL(ch) QChar::isLowSurrogate((ch)) +#define U16_GET_SUPPLEMENTARY(lead, trail) static_cast<UChar32>(QChar::surrogateToUcs4((lead), (trail))) +#define U_IS_BMP(ch) ((ch) < 0x10000) +#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) +#define UCHAR_MAX_VALUE 0x10ffff + +#define U_MASK(category) (1u << (category)) +#define U_GET_GC_MASK(c) U_MASK(QChar::category((c))) +#define U_GC_L_MASK (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK) +#define U_GC_LU_MASK U_MASK(QChar::Letter_Uppercase) +#define U_GC_LL_MASK U_MASK(QChar::Letter_Lowercase) +#define U_GC_LT_MASK U_MASK(QChar::Letter_Titlecase) +#define U_GC_LM_MASK U_MASK(QChar::Letter_Modifier) +#define U_GC_LO_MASK U_MASK(QChar::Letter_Other) +#define U_GC_MN_MASK U_MASK(QChar::Mark_NonSpacing) +#define U_GC_MC_MASK U_MASK(QChar::Mark_SpacingCombining) +#define U_GC_ND_MASK U_MASK(QChar::Number_DecimalDigit) +#define U_GC_PC_MASK U_MASK(QChar::Punctuation_Connector) + #endif // UNICODE_H diff --git a/src/3rdparty/masm/stubs/wtf/unicode/utypes.h b/src/3rdparty/masm/stubs/wtf/unicode/utypes.h new file mode 100644 index 0000000000..e1b4ff90a6 --- /dev/null +++ b/src/3rdparty/masm/stubs/wtf/unicode/utypes.h @@ -0,0 +1 @@ +#include <unicode/Unicode.h> diff --git a/src/3rdparty/masm/stubs/yarr/YarrUnicodeProperties.cpp b/src/3rdparty/masm/stubs/yarr/YarrUnicodeProperties.cpp new file mode 100644 index 0000000000..99c925f406 --- /dev/null +++ b/src/3rdparty/masm/stubs/yarr/YarrUnicodeProperties.cpp @@ -0,0 +1,70 @@ 
+/**************************************************************************** +** +** Copyright (C) 2018 The Qt Company Ltd. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtQml module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. 
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "config.h"
+#include "yarr/YarrUnicodeProperties.h"
+#include "qchar.h"
+
+#include "yarr/Yarr.h"
+#include "yarr/YarrPattern.h"
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+// Stub implementations: every function below ignores its arguments and reports
+// "nothing found".
+// NOTE(review): presumably this means Unicode property escapes are not supported
+// in this build - confirm against the callers in the Yarr parser.
+
+// Always returns std::nullopt, whatever property name/value pair is passed.
+std::optional<BuiltInCharacterClassID> unicodeMatchPropertyValue(WTF::String unicodePropertyName, WTF::String unicodePropertyValue)
+{
+    Q_UNUSED(unicodePropertyName);
+    Q_UNUSED(unicodePropertyValue);
+    return std::nullopt;
+}
+
+// Always returns std::nullopt, whatever property value is passed.
+std::optional<BuiltInCharacterClassID> unicodeMatchProperty(WTF::String unicodePropertyValue)
+{
+    Q_UNUSED(unicodePropertyValue);
+    return std::nullopt;
+}
+
+// Always returns a null pointer; no character class is ever created here.
+std::unique_ptr<CharacterClass> createUnicodeCharacterClassFor(BuiltInCharacterClassID unicodeClassID)
+{
+    Q_UNUSED(unicodeClassID);
+    return nullptr;
+}
+
+} } // namespace JSC::Yarr
diff --git a/src/3rdparty/masm/wtf/Assertions.h b/src/3rdparty/masm/wtf/Assertions.h index 491e434498..e2c04ac0bb 100644 --- a/src/3rdparty/masm/wtf/Assertions.h +++ b/src/3rdparty/masm/wtf/Assertions.h @@ -171,7 +171,7 @@ WTF_EXPORT_PRIVATE void WTFInstallReportBacktraceOnCrashHook(); #define CRASH() \ (WTFReportBacktrace(), \ WTFInvokeCrashHook(), \ - (*(int *)(uintptr_t)0xbbadbeef = 0), \ + (*reinterpret_cast<int *>(uintptr_t(0xbbadbeef)) = 0), \ __builtin_trap()) #else #define CRASH() \ @@ -256,7 +256,7 @@ inline void assertUnused(T& x) { (void)x; } (void)0) #define ASSERT_NOT_REACHED() do { \ - WTFReportAssertionFailure(__FILE__, __LINE__, WTF_PRETTY_FUNCTION, 0); \ + WTFReportAssertionFailure(__FILE__, __LINE__, WTF_PRETTY_FUNCTION, nullptr); \ CRASH(); \ } while (0) diff --git a/src/3rdparty/masm/wtf/FilePrintStream.cpp b/src/3rdparty/masm/wtf/FilePrintStream.cpp index 45f1565f46..28714ecb6f 100644 --- a/src/3rdparty/masm/wtf/FilePrintStream.cpp +++ b/src/3rdparty/masm/wtf/FilePrintStream.cpp @@ -38,17 +38,16 @@ FilePrintStream::~FilePrintStream() { if (m_adoptionMode == Borrow) return;
- if (m_file) - fclose(m_file); + fclose(m_file); } -PassOwnPtr<FilePrintStream> FilePrintStream::open(const char* filename, const char* mode) +std::unique_ptr<FilePrintStream> FilePrintStream::open(const char* filename, const char* mode) { FILE* file = fopen(filename, mode); if (!file) - return PassOwnPtr<FilePrintStream>(); - - return adoptPtr(new FilePrintStream(file)); + return nullptr; + + return std::make_unique<FilePrintStream>(file); } void FilePrintStream::vprintf(const char* format, va_list argList) diff --git a/src/3rdparty/masm/wtf/FilePrintStream.h b/src/3rdparty/masm/wtf/FilePrintStream.h index bdeab4c479..f32ca49dcb 100644 --- a/src/3rdparty/masm/wtf/FilePrintStream.h +++ b/src/3rdparty/masm/wtf/FilePrintStream.h @@ -27,7 +27,6 @@ #define FilePrintStream_h #include <stdio.h> -#include <wtf/PassOwnPtr.h> #include <wtf/PrintStream.h> namespace WTF { @@ -40,14 +39,14 @@ public: }; FilePrintStream(FILE*, AdoptionMode = Adopt); - virtual ~FilePrintStream(); + virtual ~FilePrintStream() override; - static PassOwnPtr<FilePrintStream> open(const char* filename, const char* mode); + WTF_EXPORT_PRIVATE static std::unique_ptr<FilePrintStream> open(const char* filename, const char* mode); FILE* file() { return m_file; } - void vprintf(const char* format, va_list) WTF_ATTRIBUTE_PRINTF(2, 0); - void flush(); + void vprintf(const char* format, va_list) override WTF_ATTRIBUTE_PRINTF(2, 0); + void flush() override; private: FILE* m_file; diff --git a/src/3rdparty/masm/wtf/Platform.h b/src/3rdparty/masm/wtf/Platform.h index 5905f42f45..d10a60e642 100644 --- a/src/3rdparty/masm/wtf/Platform.h +++ b/src/3rdparty/masm/wtf/Platform.h @@ -2,6 +2,7 @@ * Copyright (C) 2006, 2007, 2008, 2009, 2013 Apple Inc. All rights reserved. * Copyright (C) 2007-2009 Torch Mobile, Inc. * Copyright (C) 2010, 2011 Research In Motion Limited. All rights reserved. + * Copyright (C) 2018 The Qt Company Ltd. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -1050,4 +1051,11 @@ #define WTF_USE_CONTENT_FILTERING 1 #endif +#if ENABLE(YARR_JIT) +#if 0 // CPU(ARM64) || (CPU(X86_64) && !OS(WINDOWS)) +/* Enable JIT'ing Regular Expressions that have nested parenthesis. */ +#define ENABLE_YARR_JIT_ALL_PARENS_EXPRESSIONS 1 +#endif +#endif + #endif /* WTF_Platform_h */ diff --git a/src/3rdparty/masm/wtf/PrintStream.h b/src/3rdparty/masm/wtf/PrintStream.h index 6fcf9c1567..4372288aff 100644 --- a/src/3rdparty/masm/wtf/PrintStream.h +++ b/src/3rdparty/masm/wtf/PrintStream.h @@ -206,6 +206,10 @@ public: print(value12); print(value13); } + + void println(); + template<typename ...Types> + void println(Types... args); }; WTF_EXPORT_PRIVATE void printInternal(PrintStream&, const char*); @@ -227,6 +231,19 @@ void printInternal(PrintStream& out, const T& value) value.dump(out); } +inline +void PrintStream::println() +{ + print("\n"); +} + +template<typename ...Types> +void PrintStream::println(Types... args) +{ + print(args...); + print("\n"); +} + #define MAKE_PRINT_ADAPTOR(Name, Type, function) \ class Name { \ public: \ diff --git a/src/3rdparty/masm/wtf/StdLibExtras.h b/src/3rdparty/masm/wtf/StdLibExtras.h index f0d792ed52..18d15542ac 100644 --- a/src/3rdparty/masm/wtf/StdLibExtras.h +++ b/src/3rdparty/masm/wtf/StdLibExtras.h @@ -28,6 +28,8 @@ #include <wtf/Assertions.h> #include <wtf/CheckedArithmetic.h> +#include <wtf/Platform.h> +#include <memory> // Use these to declare and define a static local variable (static T;) so that // it is leaked so that its destructors are not called at exit. 
Using this @@ -71,6 +73,8 @@ #define STRINGIZE(exp) #exp #define STRINGIZE_VALUE_OF(exp) STRINGIZE(exp) +#define FALLTHROUGH + /* * The reinterpret_cast<Type1*>([pointer to Type2]) expressions - where * sizeof(Type1) > sizeof(Type2) - cause the following warning on ARM with GCC: diff --git a/src/3rdparty/masm/yarr/Yarr.h b/src/3rdparty/masm/yarr/Yarr.h index d393e9fa90..ccf78f9880 100644 --- a/src/3rdparty/masm/yarr/Yarr.h +++ b/src/3rdparty/masm/yarr/Yarr.h @@ -25,25 +25,26 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef Yarr_h -#define Yarr_h +#pragma once -#include "YarrInterpreter.h" -#include "YarrPattern.h" +#include <limits.h>
+#include <limits> // std::numeric_limits for offsetNoMatch below; <limits.h> only supplies UINT_MAX
+#include "YarrErrorCode.h" namespace JSC { namespace Yarr { -#define YarrStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. -#define YarrStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoPatternCharacter 2 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoCharacterClass 2 // Only for !fixed quantifiers. #define YarrStackSpaceForBackTrackInfoBackReference 2 #define YarrStackSpaceForBackTrackInfoAlternative 1 // One per alternative. #define YarrStackSpaceForBackTrackInfoParentheticalAssertion 1 -#define YarrStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. +#define YarrStackSpaceForBackTrackInfoParenthesesOnce 2 #define YarrStackSpaceForBackTrackInfoParenthesesTerminal 1 -#define YarrStackSpaceForBackTrackInfoParentheses 2 +#define YarrStackSpaceForBackTrackInfoParentheses 4 +#define YarrStackSpaceForDotStarEnclosure 1 static const unsigned quantifyInfinite = UINT_MAX; -static const unsigned offsetNoMatch = (unsigned)-1; +static const unsigned offsetNoMatch = std::numeric_limits<unsigned>::max(); // The below limit restricts the number of "recursive" match calls in order to // avoid spending exponential time on complex regular expressions.
@@ -53,9 +53,10 @@ enum JSRegExpResult { JSRegExpMatch = 1, JSRegExpNoMatch = 0, JSRegExpErrorNoMatch = -1, - JSRegExpErrorHitLimit = -2, - JSRegExpErrorNoMemory = -3, - JSRegExpErrorInternal = -4 + JSRegExpJITCodeFailure = -2, + JSRegExpErrorHitLimit = -3, + JSRegExpErrorNoMemory = -4, + JSRegExpErrorInternal = -5, }; enum YarrCharSize { @@ -63,7 +64,14 @@ enum YarrCharSize { Char16 }; -} } // namespace JSC::Yarr +enum class BuiltInCharacterClassID : unsigned { + DigitClassID, + SpaceClassID, + WordClassID, + DotClassID, + BaseUnicodePropertyID +}; -#endif // Yarr_h +struct BytecodePattern; +} } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrCanonicalize.h b/src/3rdparty/masm/yarr/YarrCanonicalize.h new file mode 100644 index 0000000000..fb5e0231ac --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrCanonicalize.h @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2012-2016 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <unicode/utypes.h>
+
+namespace JSC { namespace Yarr {
+
+// This set of data provides information for each UCS2 code point as to the set of code points
+// that it should match under the ES6 case insensitive RegExp matching rules, specified in 21.2.2.8.2.
+// The non-Unicode tables are autogenerated using YarrCanonicalize.js into YarrCanonicalize.cpp.
+// The Unicode tables are autogenerated using the python script generateYarrCanonicalizeUnicode
+// which creates YarrCanonicalizeUnicode.cpp.
+enum UCS2CanonicalizationType {
+    CanonicalizeUnique,               // No canonically equal values, e.g. 0x0.
+    CanonicalizeSet,                  // Value indicates a set in characterSetInfo.
+    CanonicalizeRangeLo,              // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61.
+    CanonicalizeRangeHi,              // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41.
+    CanonicalizeAlternatingAligned,   // Aligned consecutive pair, e.g. 0x1f4,0x1f5.
+    CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair, e.g. 0x241,0x242.
+};
+// One row of a canonicalization table: the inclusive code point range [begin, end],
+// how characters in it canonicalize (type), and a type-dependent value (a set index
+// for CanonicalizeSet, a delta for the RangeLo/RangeHi types - see the enum above).
+struct CanonicalizationRange {
+    UChar32 begin;
+    UChar32 end;
+    UChar32 value;
+    UCS2CanonicalizationType type;
+};
+
+extern const size_t UCS2_CANONICALIZATION_RANGES;
+extern const UChar32* const ucs2CharacterSetInfo[];
+extern const CanonicalizationRange ucs2RangeInfo[];
+
+extern const size_t UNICODE_CANONICALIZATION_RANGES;
+extern const UChar32* const unicodeCharacterSetInfo[];
+extern const CanonicalizationRange unicodeRangeInfo[];
+
+// Selects which pair of tables (ucs2* or unicode*) the helpers below consult.
+enum class CanonicalMode { UCS2, Unicode };
+
+// Returns entry `index` of the character-set table selected by canonicalMode.
+// The index is not range-checked.
+inline const UChar32* canonicalCharacterSetInfo(unsigned index, CanonicalMode canonicalMode)
+{
+    const UChar32* const* rangeInfo = canonicalMode == CanonicalMode::UCS2 ? ucs2CharacterSetInfo : unicodeCharacterSetInfo;
+    return rangeInfo[index];
+}
+
+// This searches in log2 time over ~400-600 entries, so should typically result in 9 compares.
+// Binary search for the range containing ch in the table selected by canonicalMode.
+// NOTE(review): the loop only exits by finding a range; there is no not-found path.
+// This presumably relies on the generated tables covering every code point that can
+// be passed in (including ch above 0xffff only in Unicode mode) - confirm at the callers.
+inline const CanonicalizationRange* canonicalRangeInfoFor(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
+{
+    const CanonicalizationRange* info = canonicalMode == CanonicalMode::UCS2 ? ucs2RangeInfo : unicodeRangeInfo;
+    size_t entries = canonicalMode == CanonicalMode::UCS2 ? UCS2_CANONICALIZATION_RANGES : UNICODE_CANONICALIZATION_RANGES;
+
+    while (true) {
+        size_t candidate = entries >> 1;
+        const CanonicalizationRange* candidateInfo = info + candidate;
+        if (ch < candidateInfo->begin)
+            entries = candidate; // Keep searching the lower half [info, info + candidate).
+        else if (ch <= candidateInfo->end)
+            return candidateInfo;
+        else {
+            // Keep searching the upper half, which starts just past the candidate.
+            info = candidateInfo + 1;
+            entries -= (candidate + 1);
+        }
+    }
+}
+
+// Should only be called for characters that have one canonically matching value.
+inline UChar32 getCanonicalPair(const CanonicalizationRange* info, UChar32 ch)
+{
+    // info must be the table row whose inclusive range contains ch.
+    ASSERT(ch >= info->begin && ch <= info->end);
+    switch (info->type) {
+    case CanonicalizeRangeLo:
+        return ch + info->value;
+    case CanonicalizeRangeHi:
+        return ch - info->value;
+    case CanonicalizeAlternatingAligned:
+        // Pairs are (even, odd): flip the low bit.
+        return ch ^ 1;
+    case CanonicalizeAlternatingUnaligned:
+        // Pairs are (odd, even): shift down by one, flip the low bit, shift back.
+        return ((ch - 1) ^ 1) + 1;
+    default:
+        // CanonicalizeUnique and CanonicalizeSet have no single pair and end up here.
+        RELEASE_ASSERT_NOT_REACHED();
+    }
+    RELEASE_ASSERT_NOT_REACHED();
+    return 0;
+}
+
+// Returns true if no other UCS2 codepoint can match this value.
+inline bool isCanonicallyUnique(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
+{
+    return canonicalRangeInfoFor(ch, canonicalMode)->type == CanonicalizeUnique;
+}
+
+// Returns true if values are equal, under the canonicalization rules.
+// The table row is looked up for `a` only; `b` is compared against what `a` maps to.
+inline bool areCanonicallyEquivalent(UChar32 a, UChar32 b, CanonicalMode canonicalMode = CanonicalMode::UCS2)
+{
+    const CanonicalizationRange* info = canonicalRangeInfoFor(a, canonicalMode);
+    switch (info->type) {
+    case CanonicalizeUnique:
+        return a == b;
+    case CanonicalizeSet: {
+        // Walk the set until its 0 terminator; `a` is reused as the cursor value.
+        for (const UChar32* set = canonicalCharacterSetInfo(info->value, canonicalMode); (a = *set); ++set) {
+            if (a == b)
+                return true;
+        }
+        return false;
+    }
+    case CanonicalizeRangeLo:
+        return (a == b) || (a + info->value == b);
+    case CanonicalizeRangeHi:
+        return (a == b) || (a - info->value == b);
+    case CanonicalizeAlternatingAligned:
+        // Same (even, odd) pair when both agree once the low bit is set.
+        return (a | 1) == (b | 1);
+    case CanonicalizeAlternatingUnaligned:
+        // Same (odd, even) pair: shift down by one and compare as above.
+        return ((a - 1) | 1) == ((b - 1) | 1);
+    }
+
+    RELEASE_ASSERT_NOT_REACHED();
+    return false;
+}
+
+} } // JSC::Yarr
diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp index 7bb3d08eb5..d91c771590 100644 --- a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Apple Inc. All rights reserved.
+ * Copyright (C) 2012-2013, 2015-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,33 +23,31 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js +// DO NOT EDIT! - this file autogenerated by YarrCanonicalize.js #include "config.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" namespace JSC { namespace Yarr { -#include <stdint.h> - -uint16_t ucs2CharacterSet0[] = { 0x01c4u, 0x01c5u, 0x01c6u, 0 }; -uint16_t ucs2CharacterSet1[] = { 0x01c7u, 0x01c8u, 0x01c9u, 0 }; -uint16_t ucs2CharacterSet2[] = { 0x01cau, 0x01cbu, 0x01ccu, 0 }; -uint16_t ucs2CharacterSet3[] = { 0x01f1u, 0x01f2u, 0x01f3u, 0 }; -uint16_t ucs2CharacterSet4[] = { 0x0392u, 0x03b2u, 0x03d0u, 0 }; -uint16_t ucs2CharacterSet5[] = { 0x0395u, 0x03b5u, 0x03f5u, 0 }; -uint16_t ucs2CharacterSet6[] = { 0x0398u, 0x03b8u, 0x03d1u, 0 }; -uint16_t ucs2CharacterSet7[] = { 0x0345u, 0x0399u, 0x03b9u, 0x1fbeu, 0 }; -uint16_t ucs2CharacterSet8[] = { 0x039au, 0x03bau, 0x03f0u, 0 }; -uint16_t ucs2CharacterSet9[] = { 0x00b5u, 0x039cu, 0x03bcu, 0 }; -uint16_t ucs2CharacterSet10[] = { 0x03a0u, 0x03c0u, 0x03d6u, 0 }; -uint16_t ucs2CharacterSet11[] = { 0x03a1u, 0x03c1u, 0x03f1u, 0 }; -uint16_t ucs2CharacterSet12[] = { 0x03a3u, 0x03c2u, 0x03c3u, 0 }; -uint16_t ucs2CharacterSet13[] = { 0x03a6u, 0x03c6u, 0x03d5u, 0 }; -uint16_t ucs2CharacterSet14[] = { 0x1e60u, 0x1e61u, 0x1e9bu, 0 }; +const UChar32 ucs2CharacterSet0[] = { 0x01c4, 0x01c5, 0x01c6, 0 }; +const UChar32 ucs2CharacterSet1[] = { 0x01c7, 0x01c8, 0x01c9, 0 }; +const UChar32 ucs2CharacterSet2[] = { 0x01ca, 0x01cb, 0x01cc, 0 }; +const UChar32 ucs2CharacterSet3[] = { 0x01f1, 0x01f2, 0x01f3, 0 }; +const UChar32 ucs2CharacterSet4[] = { 0x0392, 0x03b2, 0x03d0, 0 }; +const UChar32 ucs2CharacterSet5[] = { 0x0395, 0x03b5, 0x03f5, 0 }; 
+const UChar32 ucs2CharacterSet6[] = { 0x0398, 0x03b8, 0x03d1, 0 }; +const UChar32 ucs2CharacterSet7[] = { 0x0345, 0x0399, 0x03b9, 0x1fbe, 0 }; +const UChar32 ucs2CharacterSet8[] = { 0x039a, 0x03ba, 0x03f0, 0 }; +const UChar32 ucs2CharacterSet9[] = { 0x00b5, 0x039c, 0x03bc, 0 }; +const UChar32 ucs2CharacterSet10[] = { 0x03a0, 0x03c0, 0x03d6, 0 }; +const UChar32 ucs2CharacterSet11[] = { 0x03a1, 0x03c1, 0x03f1, 0 }; +const UChar32 ucs2CharacterSet12[] = { 0x03a3, 0x03c2, 0x03c3, 0 }; +const UChar32 ucs2CharacterSet13[] = { 0x03a6, 0x03c6, 0x03d5, 0 }; +const UChar32 ucs2CharacterSet14[] = { 0x1e60, 0x1e61, 0x1e9b, 0 }; static const size_t UCS2_CANONICALIZATION_SETS = 15; -uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { +const UChar32* const ucs2CharacterSetInfo[UCS2_CANONICALIZATION_SETS] = { ucs2CharacterSet0, ucs2CharacterSet1, ucs2CharacterSet2, @@ -67,396 +65,399 @@ uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = { ucs2CharacterSet14, }; -const size_t UCS2_CANONICALIZATION_RANGES = 364; -UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = { - { 0x0000u, 0x0040u, 0x0000u, CanonicalizeUnique }, - { 0x0041u, 0x005au, 0x0020u, CanonicalizeRangeLo }, - { 0x005bu, 0x0060u, 0x0000u, CanonicalizeUnique }, - { 0x0061u, 0x007au, 0x0020u, CanonicalizeRangeHi }, - { 0x007bu, 0x00b4u, 0x0000u, CanonicalizeUnique }, - { 0x00b5u, 0x00b5u, 0x0009u, CanonicalizeSet }, - { 0x00b6u, 0x00bfu, 0x0000u, CanonicalizeUnique }, - { 0x00c0u, 0x00d6u, 0x0020u, CanonicalizeRangeLo }, - { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeUnique }, - { 0x00d8u, 0x00deu, 0x0020u, CanonicalizeRangeLo }, - { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeUnique }, - { 0x00e0u, 0x00f6u, 0x0020u, CanonicalizeRangeHi }, - { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeUnique }, - { 0x00f8u, 0x00feu, 0x0020u, CanonicalizeRangeHi }, - { 0x00ffu, 0x00ffu, 0x0079u, CanonicalizeRangeLo }, - { 0x0100u, 0x012fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0130u, 0x0131u, 0x0000u, 
CanonicalizeUnique }, - { 0x0132u, 0x0137u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0138u, 0x0138u, 0x0000u, CanonicalizeUnique }, - { 0x0139u, 0x0148u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0149u, 0x0149u, 0x0000u, CanonicalizeUnique }, - { 0x014au, 0x0177u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0178u, 0x0178u, 0x0079u, CanonicalizeRangeHi }, - { 0x0179u, 0x017eu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x017fu, 0x017fu, 0x0000u, CanonicalizeUnique }, - { 0x0180u, 0x0180u, 0x00c3u, CanonicalizeRangeLo }, - { 0x0181u, 0x0181u, 0x00d2u, CanonicalizeRangeLo }, - { 0x0182u, 0x0185u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0186u, 0x0186u, 0x00ceu, CanonicalizeRangeLo }, - { 0x0187u, 0x0188u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0189u, 0x018au, 0x00cdu, CanonicalizeRangeLo }, - { 0x018bu, 0x018cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x018du, 0x018du, 0x0000u, CanonicalizeUnique }, - { 0x018eu, 0x018eu, 0x004fu, CanonicalizeRangeLo }, - { 0x018fu, 0x018fu, 0x00cau, CanonicalizeRangeLo }, - { 0x0190u, 0x0190u, 0x00cbu, CanonicalizeRangeLo }, - { 0x0191u, 0x0192u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0193u, 0x0193u, 0x00cdu, CanonicalizeRangeLo }, - { 0x0194u, 0x0194u, 0x00cfu, CanonicalizeRangeLo }, - { 0x0195u, 0x0195u, 0x0061u, CanonicalizeRangeLo }, - { 0x0196u, 0x0196u, 0x00d3u, CanonicalizeRangeLo }, - { 0x0197u, 0x0197u, 0x00d1u, CanonicalizeRangeLo }, - { 0x0198u, 0x0199u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x019au, 0x019au, 0x00a3u, CanonicalizeRangeLo }, - { 0x019bu, 0x019bu, 0x0000u, CanonicalizeUnique }, - { 0x019cu, 0x019cu, 0x00d3u, CanonicalizeRangeLo }, - { 0x019du, 0x019du, 0x00d5u, CanonicalizeRangeLo }, - { 0x019eu, 0x019eu, 0x0082u, CanonicalizeRangeLo }, - { 0x019fu, 0x019fu, 0x00d6u, CanonicalizeRangeLo }, - { 0x01a0u, 0x01a5u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01a6u, 0x01a6u, 0x00dau, CanonicalizeRangeLo }, - { 0x01a7u, 0x01a8u, 
0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01a9u, 0x01a9u, 0x00dau, CanonicalizeRangeLo }, - { 0x01aau, 0x01abu, 0x0000u, CanonicalizeUnique }, - { 0x01acu, 0x01adu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01aeu, 0x01aeu, 0x00dau, CanonicalizeRangeLo }, - { 0x01afu, 0x01b0u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01b1u, 0x01b2u, 0x00d9u, CanonicalizeRangeLo }, - { 0x01b3u, 0x01b6u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01b7u, 0x01b7u, 0x00dbu, CanonicalizeRangeLo }, - { 0x01b8u, 0x01b9u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01bau, 0x01bbu, 0x0000u, CanonicalizeUnique }, - { 0x01bcu, 0x01bdu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01beu, 0x01beu, 0x0000u, CanonicalizeUnique }, - { 0x01bfu, 0x01bfu, 0x0038u, CanonicalizeRangeLo }, - { 0x01c0u, 0x01c3u, 0x0000u, CanonicalizeUnique }, - { 0x01c4u, 0x01c6u, 0x0000u, CanonicalizeSet }, - { 0x01c7u, 0x01c9u, 0x0001u, CanonicalizeSet }, - { 0x01cau, 0x01ccu, 0x0002u, CanonicalizeSet }, - { 0x01cdu, 0x01dcu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x01ddu, 0x01ddu, 0x004fu, CanonicalizeRangeHi }, - { 0x01deu, 0x01efu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01f0u, 0x01f0u, 0x0000u, CanonicalizeUnique }, - { 0x01f1u, 0x01f3u, 0x0003u, CanonicalizeSet }, - { 0x01f4u, 0x01f5u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x01f6u, 0x01f6u, 0x0061u, CanonicalizeRangeHi }, - { 0x01f7u, 0x01f7u, 0x0038u, CanonicalizeRangeHi }, - { 0x01f8u, 0x021fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0220u, 0x0220u, 0x0082u, CanonicalizeRangeHi }, - { 0x0221u, 0x0221u, 0x0000u, CanonicalizeUnique }, - { 0x0222u, 0x0233u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0234u, 0x0239u, 0x0000u, CanonicalizeUnique }, - { 0x023au, 0x023au, 0x2a2bu, CanonicalizeRangeLo }, - { 0x023bu, 0x023cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x023du, 0x023du, 0x00a3u, CanonicalizeRangeHi }, - { 0x023eu, 0x023eu, 0x2a28u, CanonicalizeRangeLo }, - { 0x023fu, 
0x0240u, 0x2a3fu, CanonicalizeRangeLo }, - { 0x0241u, 0x0242u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x0243u, 0x0243u, 0x00c3u, CanonicalizeRangeHi }, - { 0x0244u, 0x0244u, 0x0045u, CanonicalizeRangeLo }, - { 0x0245u, 0x0245u, 0x0047u, CanonicalizeRangeLo }, - { 0x0246u, 0x024fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0250u, 0x0250u, 0x2a1fu, CanonicalizeRangeLo }, - { 0x0251u, 0x0251u, 0x2a1cu, CanonicalizeRangeLo }, - { 0x0252u, 0x0252u, 0x2a1eu, CanonicalizeRangeLo }, - { 0x0253u, 0x0253u, 0x00d2u, CanonicalizeRangeHi }, - { 0x0254u, 0x0254u, 0x00ceu, CanonicalizeRangeHi }, - { 0x0255u, 0x0255u, 0x0000u, CanonicalizeUnique }, - { 0x0256u, 0x0257u, 0x00cdu, CanonicalizeRangeHi }, - { 0x0258u, 0x0258u, 0x0000u, CanonicalizeUnique }, - { 0x0259u, 0x0259u, 0x00cau, CanonicalizeRangeHi }, - { 0x025au, 0x025au, 0x0000u, CanonicalizeUnique }, - { 0x025bu, 0x025bu, 0x00cbu, CanonicalizeRangeHi }, - { 0x025cu, 0x025fu, 0x0000u, CanonicalizeUnique }, - { 0x0260u, 0x0260u, 0x00cdu, CanonicalizeRangeHi }, - { 0x0261u, 0x0262u, 0x0000u, CanonicalizeUnique }, - { 0x0263u, 0x0263u, 0x00cfu, CanonicalizeRangeHi }, - { 0x0264u, 0x0264u, 0x0000u, CanonicalizeUnique }, - { 0x0265u, 0x0265u, 0xa528u, CanonicalizeRangeLo }, - { 0x0266u, 0x0267u, 0x0000u, CanonicalizeUnique }, - { 0x0268u, 0x0268u, 0x00d1u, CanonicalizeRangeHi }, - { 0x0269u, 0x0269u, 0x00d3u, CanonicalizeRangeHi }, - { 0x026au, 0x026au, 0x0000u, CanonicalizeUnique }, - { 0x026bu, 0x026bu, 0x29f7u, CanonicalizeRangeLo }, - { 0x026cu, 0x026eu, 0x0000u, CanonicalizeUnique }, - { 0x026fu, 0x026fu, 0x00d3u, CanonicalizeRangeHi }, - { 0x0270u, 0x0270u, 0x0000u, CanonicalizeUnique }, - { 0x0271u, 0x0271u, 0x29fdu, CanonicalizeRangeLo }, - { 0x0272u, 0x0272u, 0x00d5u, CanonicalizeRangeHi }, - { 0x0273u, 0x0274u, 0x0000u, CanonicalizeUnique }, - { 0x0275u, 0x0275u, 0x00d6u, CanonicalizeRangeHi }, - { 0x0276u, 0x027cu, 0x0000u, CanonicalizeUnique }, - { 0x027du, 0x027du, 0x29e7u, CanonicalizeRangeLo }, - 
{ 0x027eu, 0x027fu, 0x0000u, CanonicalizeUnique }, - { 0x0280u, 0x0280u, 0x00dau, CanonicalizeRangeHi }, - { 0x0281u, 0x0282u, 0x0000u, CanonicalizeUnique }, - { 0x0283u, 0x0283u, 0x00dau, CanonicalizeRangeHi }, - { 0x0284u, 0x0287u, 0x0000u, CanonicalizeUnique }, - { 0x0288u, 0x0288u, 0x00dau, CanonicalizeRangeHi }, - { 0x0289u, 0x0289u, 0x0045u, CanonicalizeRangeHi }, - { 0x028au, 0x028bu, 0x00d9u, CanonicalizeRangeHi }, - { 0x028cu, 0x028cu, 0x0047u, CanonicalizeRangeHi }, - { 0x028du, 0x0291u, 0x0000u, CanonicalizeUnique }, - { 0x0292u, 0x0292u, 0x00dbu, CanonicalizeRangeHi }, - { 0x0293u, 0x0344u, 0x0000u, CanonicalizeUnique }, - { 0x0345u, 0x0345u, 0x0007u, CanonicalizeSet }, - { 0x0346u, 0x036fu, 0x0000u, CanonicalizeUnique }, - { 0x0370u, 0x0373u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0374u, 0x0375u, 0x0000u, CanonicalizeUnique }, - { 0x0376u, 0x0377u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0378u, 0x037au, 0x0000u, CanonicalizeUnique }, - { 0x037bu, 0x037du, 0x0082u, CanonicalizeRangeLo }, - { 0x037eu, 0x0385u, 0x0000u, CanonicalizeUnique }, - { 0x0386u, 0x0386u, 0x0026u, CanonicalizeRangeLo }, - { 0x0387u, 0x0387u, 0x0000u, CanonicalizeUnique }, - { 0x0388u, 0x038au, 0x0025u, CanonicalizeRangeLo }, - { 0x038bu, 0x038bu, 0x0000u, CanonicalizeUnique }, - { 0x038cu, 0x038cu, 0x0040u, CanonicalizeRangeLo }, - { 0x038du, 0x038du, 0x0000u, CanonicalizeUnique }, - { 0x038eu, 0x038fu, 0x003fu, CanonicalizeRangeLo }, - { 0x0390u, 0x0390u, 0x0000u, CanonicalizeUnique }, - { 0x0391u, 0x0391u, 0x0020u, CanonicalizeRangeLo }, - { 0x0392u, 0x0392u, 0x0004u, CanonicalizeSet }, - { 0x0393u, 0x0394u, 0x0020u, CanonicalizeRangeLo }, - { 0x0395u, 0x0395u, 0x0005u, CanonicalizeSet }, - { 0x0396u, 0x0397u, 0x0020u, CanonicalizeRangeLo }, - { 0x0398u, 0x0398u, 0x0006u, CanonicalizeSet }, - { 0x0399u, 0x0399u, 0x0007u, CanonicalizeSet }, - { 0x039au, 0x039au, 0x0008u, CanonicalizeSet }, - { 0x039bu, 0x039bu, 0x0020u, CanonicalizeRangeLo }, - { 0x039cu, 
0x039cu, 0x0009u, CanonicalizeSet }, - { 0x039du, 0x039fu, 0x0020u, CanonicalizeRangeLo }, - { 0x03a0u, 0x03a0u, 0x000au, CanonicalizeSet }, - { 0x03a1u, 0x03a1u, 0x000bu, CanonicalizeSet }, - { 0x03a2u, 0x03a2u, 0x0000u, CanonicalizeUnique }, - { 0x03a3u, 0x03a3u, 0x000cu, CanonicalizeSet }, - { 0x03a4u, 0x03a5u, 0x0020u, CanonicalizeRangeLo }, - { 0x03a6u, 0x03a6u, 0x000du, CanonicalizeSet }, - { 0x03a7u, 0x03abu, 0x0020u, CanonicalizeRangeLo }, - { 0x03acu, 0x03acu, 0x0026u, CanonicalizeRangeHi }, - { 0x03adu, 0x03afu, 0x0025u, CanonicalizeRangeHi }, - { 0x03b0u, 0x03b0u, 0x0000u, CanonicalizeUnique }, - { 0x03b1u, 0x03b1u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b2u, 0x03b2u, 0x0004u, CanonicalizeSet }, - { 0x03b3u, 0x03b4u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b5u, 0x03b5u, 0x0005u, CanonicalizeSet }, - { 0x03b6u, 0x03b7u, 0x0020u, CanonicalizeRangeHi }, - { 0x03b8u, 0x03b8u, 0x0006u, CanonicalizeSet }, - { 0x03b9u, 0x03b9u, 0x0007u, CanonicalizeSet }, - { 0x03bau, 0x03bau, 0x0008u, CanonicalizeSet }, - { 0x03bbu, 0x03bbu, 0x0020u, CanonicalizeRangeHi }, - { 0x03bcu, 0x03bcu, 0x0009u, CanonicalizeSet }, - { 0x03bdu, 0x03bfu, 0x0020u, CanonicalizeRangeHi }, - { 0x03c0u, 0x03c0u, 0x000au, CanonicalizeSet }, - { 0x03c1u, 0x03c1u, 0x000bu, CanonicalizeSet }, - { 0x03c2u, 0x03c3u, 0x000cu, CanonicalizeSet }, - { 0x03c4u, 0x03c5u, 0x0020u, CanonicalizeRangeHi }, - { 0x03c6u, 0x03c6u, 0x000du, CanonicalizeSet }, - { 0x03c7u, 0x03cbu, 0x0020u, CanonicalizeRangeHi }, - { 0x03ccu, 0x03ccu, 0x0040u, CanonicalizeRangeHi }, - { 0x03cdu, 0x03ceu, 0x003fu, CanonicalizeRangeHi }, - { 0x03cfu, 0x03cfu, 0x0008u, CanonicalizeRangeLo }, - { 0x03d0u, 0x03d0u, 0x0004u, CanonicalizeSet }, - { 0x03d1u, 0x03d1u, 0x0006u, CanonicalizeSet }, - { 0x03d2u, 0x03d4u, 0x0000u, CanonicalizeUnique }, - { 0x03d5u, 0x03d5u, 0x000du, CanonicalizeSet }, - { 0x03d6u, 0x03d6u, 0x000au, CanonicalizeSet }, - { 0x03d7u, 0x03d7u, 0x0008u, CanonicalizeRangeHi }, - { 0x03d8u, 0x03efu, 0x0000u, 
CanonicalizeAlternatingAligned }, - { 0x03f0u, 0x03f0u, 0x0008u, CanonicalizeSet }, - { 0x03f1u, 0x03f1u, 0x000bu, CanonicalizeSet }, - { 0x03f2u, 0x03f2u, 0x0007u, CanonicalizeRangeLo }, - { 0x03f3u, 0x03f4u, 0x0000u, CanonicalizeUnique }, - { 0x03f5u, 0x03f5u, 0x0005u, CanonicalizeSet }, - { 0x03f6u, 0x03f6u, 0x0000u, CanonicalizeUnique }, - { 0x03f7u, 0x03f8u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x03f9u, 0x03f9u, 0x0007u, CanonicalizeRangeHi }, - { 0x03fau, 0x03fbu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x03fcu, 0x03fcu, 0x0000u, CanonicalizeUnique }, - { 0x03fdu, 0x03ffu, 0x0082u, CanonicalizeRangeHi }, - { 0x0400u, 0x040fu, 0x0050u, CanonicalizeRangeLo }, - { 0x0410u, 0x042fu, 0x0020u, CanonicalizeRangeLo }, - { 0x0430u, 0x044fu, 0x0020u, CanonicalizeRangeHi }, - { 0x0450u, 0x045fu, 0x0050u, CanonicalizeRangeHi }, - { 0x0460u, 0x0481u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0482u, 0x0489u, 0x0000u, CanonicalizeUnique }, - { 0x048au, 0x04bfu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x04c0u, 0x04c0u, 0x000fu, CanonicalizeRangeLo }, - { 0x04c1u, 0x04ceu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x04cfu, 0x04cfu, 0x000fu, CanonicalizeRangeHi }, - { 0x04d0u, 0x0527u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x0528u, 0x0530u, 0x0000u, CanonicalizeUnique }, - { 0x0531u, 0x0556u, 0x0030u, CanonicalizeRangeLo }, - { 0x0557u, 0x0560u, 0x0000u, CanonicalizeUnique }, - { 0x0561u, 0x0586u, 0x0030u, CanonicalizeRangeHi }, - { 0x0587u, 0x109fu, 0x0000u, CanonicalizeUnique }, - { 0x10a0u, 0x10c5u, 0x1c60u, CanonicalizeRangeLo }, - { 0x10c6u, 0x1d78u, 0x0000u, CanonicalizeUnique }, - { 0x1d79u, 0x1d79u, 0x8a04u, CanonicalizeRangeLo }, - { 0x1d7au, 0x1d7cu, 0x0000u, CanonicalizeUnique }, - { 0x1d7du, 0x1d7du, 0x0ee6u, CanonicalizeRangeLo }, - { 0x1d7eu, 0x1dffu, 0x0000u, CanonicalizeUnique }, - { 0x1e00u, 0x1e5fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1e60u, 0x1e61u, 0x000eu, CanonicalizeSet }, - { 0x1e62u, 
0x1e95u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1e96u, 0x1e9au, 0x0000u, CanonicalizeUnique }, - { 0x1e9bu, 0x1e9bu, 0x000eu, CanonicalizeSet }, - { 0x1e9cu, 0x1e9fu, 0x0000u, CanonicalizeUnique }, - { 0x1ea0u, 0x1effu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x1f00u, 0x1f07u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f08u, 0x1f0fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f10u, 0x1f15u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f16u, 0x1f17u, 0x0000u, CanonicalizeUnique }, - { 0x1f18u, 0x1f1du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f1eu, 0x1f1fu, 0x0000u, CanonicalizeUnique }, - { 0x1f20u, 0x1f27u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f28u, 0x1f2fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f30u, 0x1f37u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f38u, 0x1f3fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f40u, 0x1f45u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f46u, 0x1f47u, 0x0000u, CanonicalizeUnique }, - { 0x1f48u, 0x1f4du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f4eu, 0x1f50u, 0x0000u, CanonicalizeUnique }, - { 0x1f51u, 0x1f51u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f52u, 0x1f52u, 0x0000u, CanonicalizeUnique }, - { 0x1f53u, 0x1f53u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f54u, 0x1f54u, 0x0000u, CanonicalizeUnique }, - { 0x1f55u, 0x1f55u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f56u, 0x1f56u, 0x0000u, CanonicalizeUnique }, - { 0x1f57u, 0x1f57u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f58u, 0x1f58u, 0x0000u, CanonicalizeUnique }, - { 0x1f59u, 0x1f59u, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5au, 0x1f5au, 0x0000u, CanonicalizeUnique }, - { 0x1f5bu, 0x1f5bu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5cu, 0x1f5cu, 0x0000u, CanonicalizeUnique }, - { 0x1f5du, 0x1f5du, 0x0008u, CanonicalizeRangeHi }, - { 0x1f5eu, 0x1f5eu, 0x0000u, CanonicalizeUnique }, - { 0x1f5fu, 0x1f5fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f60u, 0x1f67u, 0x0008u, CanonicalizeRangeLo }, - { 0x1f68u, 0x1f6fu, 0x0008u, CanonicalizeRangeHi }, - { 0x1f70u, 0x1f71u, 0x004au, CanonicalizeRangeLo }, - { 
0x1f72u, 0x1f75u, 0x0056u, CanonicalizeRangeLo }, - { 0x1f76u, 0x1f77u, 0x0064u, CanonicalizeRangeLo }, - { 0x1f78u, 0x1f79u, 0x0080u, CanonicalizeRangeLo }, - { 0x1f7au, 0x1f7bu, 0x0070u, CanonicalizeRangeLo }, - { 0x1f7cu, 0x1f7du, 0x007eu, CanonicalizeRangeLo }, - { 0x1f7eu, 0x1fafu, 0x0000u, CanonicalizeUnique }, - { 0x1fb0u, 0x1fb1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fb2u, 0x1fb7u, 0x0000u, CanonicalizeUnique }, - { 0x1fb8u, 0x1fb9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1fbau, 0x1fbbu, 0x004au, CanonicalizeRangeHi }, - { 0x1fbcu, 0x1fbdu, 0x0000u, CanonicalizeUnique }, - { 0x1fbeu, 0x1fbeu, 0x0007u, CanonicalizeSet }, - { 0x1fbfu, 0x1fc7u, 0x0000u, CanonicalizeUnique }, - { 0x1fc8u, 0x1fcbu, 0x0056u, CanonicalizeRangeHi }, - { 0x1fccu, 0x1fcfu, 0x0000u, CanonicalizeUnique }, - { 0x1fd0u, 0x1fd1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fd2u, 0x1fd7u, 0x0000u, CanonicalizeUnique }, - { 0x1fd8u, 0x1fd9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1fdau, 0x1fdbu, 0x0064u, CanonicalizeRangeHi }, - { 0x1fdcu, 0x1fdfu, 0x0000u, CanonicalizeUnique }, - { 0x1fe0u, 0x1fe1u, 0x0008u, CanonicalizeRangeLo }, - { 0x1fe2u, 0x1fe4u, 0x0000u, CanonicalizeUnique }, - { 0x1fe5u, 0x1fe5u, 0x0007u, CanonicalizeRangeLo }, - { 0x1fe6u, 0x1fe7u, 0x0000u, CanonicalizeUnique }, - { 0x1fe8u, 0x1fe9u, 0x0008u, CanonicalizeRangeHi }, - { 0x1feau, 0x1febu, 0x0070u, CanonicalizeRangeHi }, - { 0x1fecu, 0x1fecu, 0x0007u, CanonicalizeRangeHi }, - { 0x1fedu, 0x1ff7u, 0x0000u, CanonicalizeUnique }, - { 0x1ff8u, 0x1ff9u, 0x0080u, CanonicalizeRangeHi }, - { 0x1ffau, 0x1ffbu, 0x007eu, CanonicalizeRangeHi }, - { 0x1ffcu, 0x2131u, 0x0000u, CanonicalizeUnique }, - { 0x2132u, 0x2132u, 0x001cu, CanonicalizeRangeLo }, - { 0x2133u, 0x214du, 0x0000u, CanonicalizeUnique }, - { 0x214eu, 0x214eu, 0x001cu, CanonicalizeRangeHi }, - { 0x214fu, 0x215fu, 0x0000u, CanonicalizeUnique }, - { 0x2160u, 0x216fu, 0x0010u, CanonicalizeRangeLo }, - { 0x2170u, 0x217fu, 0x0010u, CanonicalizeRangeHi }, - { 0x2180u, 0x2182u, 
0x0000u, CanonicalizeUnique }, - { 0x2183u, 0x2184u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2185u, 0x24b5u, 0x0000u, CanonicalizeUnique }, - { 0x24b6u, 0x24cfu, 0x001au, CanonicalizeRangeLo }, - { 0x24d0u, 0x24e9u, 0x001au, CanonicalizeRangeHi }, - { 0x24eau, 0x2bffu, 0x0000u, CanonicalizeUnique }, - { 0x2c00u, 0x2c2eu, 0x0030u, CanonicalizeRangeLo }, - { 0x2c2fu, 0x2c2fu, 0x0000u, CanonicalizeUnique }, - { 0x2c30u, 0x2c5eu, 0x0030u, CanonicalizeRangeHi }, - { 0x2c5fu, 0x2c5fu, 0x0000u, CanonicalizeUnique }, - { 0x2c60u, 0x2c61u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2c62u, 0x2c62u, 0x29f7u, CanonicalizeRangeHi }, - { 0x2c63u, 0x2c63u, 0x0ee6u, CanonicalizeRangeHi }, - { 0x2c64u, 0x2c64u, 0x29e7u, CanonicalizeRangeHi }, - { 0x2c65u, 0x2c65u, 0x2a2bu, CanonicalizeRangeHi }, - { 0x2c66u, 0x2c66u, 0x2a28u, CanonicalizeRangeHi }, - { 0x2c67u, 0x2c6cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2c6du, 0x2c6du, 0x2a1cu, CanonicalizeRangeHi }, - { 0x2c6eu, 0x2c6eu, 0x29fdu, CanonicalizeRangeHi }, - { 0x2c6fu, 0x2c6fu, 0x2a1fu, CanonicalizeRangeHi }, - { 0x2c70u, 0x2c70u, 0x2a1eu, CanonicalizeRangeHi }, - { 0x2c71u, 0x2c71u, 0x0000u, CanonicalizeUnique }, - { 0x2c72u, 0x2c73u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2c74u, 0x2c74u, 0x0000u, CanonicalizeUnique }, - { 0x2c75u, 0x2c76u, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2c77u, 0x2c7du, 0x0000u, CanonicalizeUnique }, - { 0x2c7eu, 0x2c7fu, 0x2a3fu, CanonicalizeRangeHi }, - { 0x2c80u, 0x2ce3u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0x2ce4u, 0x2ceau, 0x0000u, CanonicalizeUnique }, - { 0x2cebu, 0x2ceeu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0x2cefu, 0x2cffu, 0x0000u, CanonicalizeUnique }, - { 0x2d00u, 0x2d25u, 0x1c60u, CanonicalizeRangeHi }, - { 0x2d26u, 0xa63fu, 0x0000u, CanonicalizeUnique }, - { 0xa640u, 0xa66du, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa66eu, 0xa67fu, 0x0000u, CanonicalizeUnique }, - { 0xa680u, 0xa697u, 0x0000u, 
CanonicalizeAlternatingAligned }, - { 0xa698u, 0xa721u, 0x0000u, CanonicalizeUnique }, - { 0xa722u, 0xa72fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa730u, 0xa731u, 0x0000u, CanonicalizeUnique }, - { 0xa732u, 0xa76fu, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa770u, 0xa778u, 0x0000u, CanonicalizeUnique }, - { 0xa779u, 0xa77cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0xa77du, 0xa77du, 0x8a04u, CanonicalizeRangeHi }, - { 0xa77eu, 0xa787u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa788u, 0xa78au, 0x0000u, CanonicalizeUnique }, - { 0xa78bu, 0xa78cu, 0x0000u, CanonicalizeAlternatingUnaligned }, - { 0xa78du, 0xa78du, 0xa528u, CanonicalizeRangeHi }, - { 0xa78eu, 0xa78fu, 0x0000u, CanonicalizeUnique }, - { 0xa790u, 0xa791u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa792u, 0xa79fu, 0x0000u, CanonicalizeUnique }, - { 0xa7a0u, 0xa7a9u, 0x0000u, CanonicalizeAlternatingAligned }, - { 0xa7aau, 0xff20u, 0x0000u, CanonicalizeUnique }, - { 0xff21u, 0xff3au, 0x0020u, CanonicalizeRangeLo }, - { 0xff3bu, 0xff40u, 0x0000u, CanonicalizeUnique }, - { 0xff41u, 0xff5au, 0x0020u, CanonicalizeRangeHi }, - { 0xff5bu, 0xffffu, 0x0000u, CanonicalizeUnique }, -}; - -const size_t LATIN_CANONICALIZATION_RANGES = 20; -LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = { - { 0x0000u, 0x0040u, 0x0000u, CanonicalizeLatinSelf }, - { 0x0041u, 0x005au, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x005bu, 0x0060u, 0x0000u, CanonicalizeLatinSelf }, - { 0x0061u, 0x007au, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x007bu, 0x00bfu, 0x0000u, CanonicalizeLatinSelf }, - { 0x00c0u, 0x00d6u, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00d7u, 0x00d7u, 0x0000u, CanonicalizeLatinSelf }, - { 0x00d8u, 0x00deu, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00dfu, 0x00dfu, 0x0000u, CanonicalizeLatinSelf }, - { 0x00e0u, 0x00f6u, 0x0000u, CanonicalizeLatinMask0x20 }, - { 0x00f7u, 0x00f7u, 0x0000u, CanonicalizeLatinSelf }, - { 0x00f8u, 0x00feu, 0x0000u, 
CanonicalizeLatinMask0x20 }, - { 0x00ffu, 0x00ffu, 0x0000u, CanonicalizeLatinSelf }, - { 0x0100u, 0x0177u, 0x0000u, CanonicalizeLatinInvalid }, - { 0x0178u, 0x0178u, 0x00ffu, CanonicalizeLatinOther }, - { 0x0179u, 0x039bu, 0x0000u, CanonicalizeLatinInvalid }, - { 0x039cu, 0x039cu, 0x00b5u, CanonicalizeLatinOther }, - { 0x039du, 0x03bbu, 0x0000u, CanonicalizeLatinInvalid }, - { 0x03bcu, 0x03bcu, 0x00b5u, CanonicalizeLatinOther }, - { 0x03bdu, 0xffffu, 0x0000u, CanonicalizeLatinInvalid }, +const size_t UCS2_CANONICALIZATION_RANGES = 391; +const CanonicalizationRange ucs2RangeInfo[UCS2_CANONICALIZATION_RANGES] = { + { 0x0000, 0x0040, 0x0000, CanonicalizeUnique }, + { 0x0041, 0x005a, 0x0020, CanonicalizeRangeLo }, + { 0x005b, 0x0060, 0x0000, CanonicalizeUnique }, + { 0x0061, 0x007a, 0x0020, CanonicalizeRangeHi }, + { 0x007b, 0x00b4, 0x0000, CanonicalizeUnique }, + { 0x00b5, 0x00b5, 0x0009, CanonicalizeSet }, + { 0x00b6, 0x00bf, 0x0000, CanonicalizeUnique }, + { 0x00c0, 0x00d6, 0x0020, CanonicalizeRangeLo }, + { 0x00d7, 0x00d7, 0x0000, CanonicalizeUnique }, + { 0x00d8, 0x00de, 0x0020, CanonicalizeRangeLo }, + { 0x00df, 0x00df, 0x0000, CanonicalizeUnique }, + { 0x00e0, 0x00f6, 0x0020, CanonicalizeRangeHi }, + { 0x00f7, 0x00f7, 0x0000, CanonicalizeUnique }, + { 0x00f8, 0x00fe, 0x0020, CanonicalizeRangeHi }, + { 0x00ff, 0x00ff, 0x0079, CanonicalizeRangeLo }, + { 0x0100, 0x012f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0130, 0x0131, 0x0000, CanonicalizeUnique }, + { 0x0132, 0x0137, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0138, 0x0138, 0x0000, CanonicalizeUnique }, + { 0x0139, 0x0148, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0149, 0x0149, 0x0000, CanonicalizeUnique }, + { 0x014a, 0x0177, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0178, 0x0178, 0x0079, CanonicalizeRangeHi }, + { 0x0179, 0x017e, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x017f, 0x017f, 0x0000, CanonicalizeUnique }, + { 0x0180, 0x0180, 0x00c3, CanonicalizeRangeLo }, + { 
0x0181, 0x0181, 0x00d2, CanonicalizeRangeLo }, + { 0x0182, 0x0185, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0186, 0x0186, 0x00ce, CanonicalizeRangeLo }, + { 0x0187, 0x0188, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0189, 0x018a, 0x00cd, CanonicalizeRangeLo }, + { 0x018b, 0x018c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x018d, 0x018d, 0x0000, CanonicalizeUnique }, + { 0x018e, 0x018e, 0x004f, CanonicalizeRangeLo }, + { 0x018f, 0x018f, 0x00ca, CanonicalizeRangeLo }, + { 0x0190, 0x0190, 0x00cb, CanonicalizeRangeLo }, + { 0x0191, 0x0192, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0193, 0x0193, 0x00cd, CanonicalizeRangeLo }, + { 0x0194, 0x0194, 0x00cf, CanonicalizeRangeLo }, + { 0x0195, 0x0195, 0x0061, CanonicalizeRangeLo }, + { 0x0196, 0x0196, 0x00d3, CanonicalizeRangeLo }, + { 0x0197, 0x0197, 0x00d1, CanonicalizeRangeLo }, + { 0x0198, 0x0199, 0x0000, CanonicalizeAlternatingAligned }, + { 0x019a, 0x019a, 0x00a3, CanonicalizeRangeLo }, + { 0x019b, 0x019b, 0x0000, CanonicalizeUnique }, + { 0x019c, 0x019c, 0x00d3, CanonicalizeRangeLo }, + { 0x019d, 0x019d, 0x00d5, CanonicalizeRangeLo }, + { 0x019e, 0x019e, 0x0082, CanonicalizeRangeLo }, + { 0x019f, 0x019f, 0x00d6, CanonicalizeRangeLo }, + { 0x01a0, 0x01a5, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01a6, 0x01a6, 0x00da, CanonicalizeRangeLo }, + { 0x01a7, 0x01a8, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01a9, 0x01a9, 0x00da, CanonicalizeRangeLo }, + { 0x01aa, 0x01ab, 0x0000, CanonicalizeUnique }, + { 0x01ac, 0x01ad, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01ae, 0x01ae, 0x00da, CanonicalizeRangeLo }, + { 0x01af, 0x01b0, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01b1, 0x01b2, 0x00d9, CanonicalizeRangeLo }, + { 0x01b3, 0x01b6, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01b7, 0x01b7, 0x00db, CanonicalizeRangeLo }, + { 0x01b8, 0x01b9, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01ba, 0x01bb, 0x0000, CanonicalizeUnique }, + { 0x01bc, 0x01bd, 0x0000, 
CanonicalizeAlternatingAligned }, + { 0x01be, 0x01be, 0x0000, CanonicalizeUnique }, + { 0x01bf, 0x01bf, 0x0038, CanonicalizeRangeLo }, + { 0x01c0, 0x01c3, 0x0000, CanonicalizeUnique }, + { 0x01c4, 0x01c6, 0x0000, CanonicalizeSet }, + { 0x01c7, 0x01c9, 0x0001, CanonicalizeSet }, + { 0x01ca, 0x01cc, 0x0002, CanonicalizeSet }, + { 0x01cd, 0x01dc, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01dd, 0x01dd, 0x004f, CanonicalizeRangeHi }, + { 0x01de, 0x01ef, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01f0, 0x01f0, 0x0000, CanonicalizeUnique }, + { 0x01f1, 0x01f3, 0x0003, CanonicalizeSet }, + { 0x01f4, 0x01f5, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01f6, 0x01f6, 0x0061, CanonicalizeRangeHi }, + { 0x01f7, 0x01f7, 0x0038, CanonicalizeRangeHi }, + { 0x01f8, 0x021f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0220, 0x0220, 0x0082, CanonicalizeRangeHi }, + { 0x0221, 0x0221, 0x0000, CanonicalizeUnique }, + { 0x0222, 0x0233, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0234, 0x0239, 0x0000, CanonicalizeUnique }, + { 0x023a, 0x023a, 0x2a2b, CanonicalizeRangeLo }, + { 0x023b, 0x023c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x023d, 0x023d, 0x00a3, CanonicalizeRangeHi }, + { 0x023e, 0x023e, 0x2a28, CanonicalizeRangeLo }, + { 0x023f, 0x0240, 0x2a3f, CanonicalizeRangeLo }, + { 0x0241, 0x0242, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0243, 0x0243, 0x00c3, CanonicalizeRangeHi }, + { 0x0244, 0x0244, 0x0045, CanonicalizeRangeLo }, + { 0x0245, 0x0245, 0x0047, CanonicalizeRangeLo }, + { 0x0246, 0x024f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0250, 0x0250, 0x2a1f, CanonicalizeRangeLo }, + { 0x0251, 0x0251, 0x2a1c, CanonicalizeRangeLo }, + { 0x0252, 0x0252, 0x2a1e, CanonicalizeRangeLo }, + { 0x0253, 0x0253, 0x00d2, CanonicalizeRangeHi }, + { 0x0254, 0x0254, 0x00ce, CanonicalizeRangeHi }, + { 0x0255, 0x0255, 0x0000, CanonicalizeUnique }, + { 0x0256, 0x0257, 0x00cd, CanonicalizeRangeHi }, + { 0x0258, 0x0258, 0x0000, CanonicalizeUnique }, + { 
0x0259, 0x0259, 0x00ca, CanonicalizeRangeHi }, + { 0x025a, 0x025a, 0x0000, CanonicalizeUnique }, + { 0x025b, 0x025b, 0x00cb, CanonicalizeRangeHi }, + { 0x025c, 0x025c, 0xa54f, CanonicalizeRangeLo }, + { 0x025d, 0x025f, 0x0000, CanonicalizeUnique }, + { 0x0260, 0x0260, 0x00cd, CanonicalizeRangeHi }, + { 0x0261, 0x0261, 0xa54b, CanonicalizeRangeLo }, + { 0x0262, 0x0262, 0x0000, CanonicalizeUnique }, + { 0x0263, 0x0263, 0x00cf, CanonicalizeRangeHi }, + { 0x0264, 0x0264, 0x0000, CanonicalizeUnique }, + { 0x0265, 0x0265, 0xa528, CanonicalizeRangeLo }, + { 0x0266, 0x0266, 0xa544, CanonicalizeRangeLo }, + { 0x0267, 0x0267, 0x0000, CanonicalizeUnique }, + { 0x0268, 0x0268, 0x00d1, CanonicalizeRangeHi }, + { 0x0269, 0x0269, 0x00d3, CanonicalizeRangeHi }, + { 0x026a, 0x026a, 0x0000, CanonicalizeUnique }, + { 0x026b, 0x026b, 0x29f7, CanonicalizeRangeLo }, + { 0x026c, 0x026c, 0xa541, CanonicalizeRangeLo }, + { 0x026d, 0x026e, 0x0000, CanonicalizeUnique }, + { 0x026f, 0x026f, 0x00d3, CanonicalizeRangeHi }, + { 0x0270, 0x0270, 0x0000, CanonicalizeUnique }, + { 0x0271, 0x0271, 0x29fd, CanonicalizeRangeLo }, + { 0x0272, 0x0272, 0x00d5, CanonicalizeRangeHi }, + { 0x0273, 0x0274, 0x0000, CanonicalizeUnique }, + { 0x0275, 0x0275, 0x00d6, CanonicalizeRangeHi }, + { 0x0276, 0x027c, 0x0000, CanonicalizeUnique }, + { 0x027d, 0x027d, 0x29e7, CanonicalizeRangeLo }, + { 0x027e, 0x027f, 0x0000, CanonicalizeUnique }, + { 0x0280, 0x0280, 0x00da, CanonicalizeRangeHi }, + { 0x0281, 0x0282, 0x0000, CanonicalizeUnique }, + { 0x0283, 0x0283, 0x00da, CanonicalizeRangeHi }, + { 0x0284, 0x0286, 0x0000, CanonicalizeUnique }, + { 0x0287, 0x0287, 0xa52a, CanonicalizeRangeLo }, + { 0x0288, 0x0288, 0x00da, CanonicalizeRangeHi }, + { 0x0289, 0x0289, 0x0045, CanonicalizeRangeHi }, + { 0x028a, 0x028b, 0x00d9, CanonicalizeRangeHi }, + { 0x028c, 0x028c, 0x0047, CanonicalizeRangeHi }, + { 0x028d, 0x0291, 0x0000, CanonicalizeUnique }, + { 0x0292, 0x0292, 0x00db, CanonicalizeRangeHi }, + { 0x0293, 0x029d, 0x0000, 
CanonicalizeUnique }, + { 0x029e, 0x029e, 0xa512, CanonicalizeRangeLo }, + { 0x029f, 0x0344, 0x0000, CanonicalizeUnique }, + { 0x0345, 0x0345, 0x0007, CanonicalizeSet }, + { 0x0346, 0x036f, 0x0000, CanonicalizeUnique }, + { 0x0370, 0x0373, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0374, 0x0375, 0x0000, CanonicalizeUnique }, + { 0x0376, 0x0377, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0378, 0x037a, 0x0000, CanonicalizeUnique }, + { 0x037b, 0x037d, 0x0082, CanonicalizeRangeLo }, + { 0x037e, 0x037e, 0x0000, CanonicalizeUnique }, + { 0x037f, 0x037f, 0x0074, CanonicalizeRangeLo }, + { 0x0380, 0x0385, 0x0000, CanonicalizeUnique }, + { 0x0386, 0x0386, 0x0026, CanonicalizeRangeLo }, + { 0x0387, 0x0387, 0x0000, CanonicalizeUnique }, + { 0x0388, 0x038a, 0x0025, CanonicalizeRangeLo }, + { 0x038b, 0x038b, 0x0000, CanonicalizeUnique }, + { 0x038c, 0x038c, 0x0040, CanonicalizeRangeLo }, + { 0x038d, 0x038d, 0x0000, CanonicalizeUnique }, + { 0x038e, 0x038f, 0x003f, CanonicalizeRangeLo }, + { 0x0390, 0x0390, 0x0000, CanonicalizeUnique }, + { 0x0391, 0x0391, 0x0020, CanonicalizeRangeLo }, + { 0x0392, 0x0392, 0x0004, CanonicalizeSet }, + { 0x0393, 0x0394, 0x0020, CanonicalizeRangeLo }, + { 0x0395, 0x0395, 0x0005, CanonicalizeSet }, + { 0x0396, 0x0397, 0x0020, CanonicalizeRangeLo }, + { 0x0398, 0x0398, 0x0006, CanonicalizeSet }, + { 0x0399, 0x0399, 0x0007, CanonicalizeSet }, + { 0x039a, 0x039a, 0x0008, CanonicalizeSet }, + { 0x039b, 0x039b, 0x0020, CanonicalizeRangeLo }, + { 0x039c, 0x039c, 0x0009, CanonicalizeSet }, + { 0x039d, 0x039f, 0x0020, CanonicalizeRangeLo }, + { 0x03a0, 0x03a0, 0x000a, CanonicalizeSet }, + { 0x03a1, 0x03a1, 0x000b, CanonicalizeSet }, + { 0x03a2, 0x03a2, 0x0000, CanonicalizeUnique }, + { 0x03a3, 0x03a3, 0x000c, CanonicalizeSet }, + { 0x03a4, 0x03a5, 0x0020, CanonicalizeRangeLo }, + { 0x03a6, 0x03a6, 0x000d, CanonicalizeSet }, + { 0x03a7, 0x03ab, 0x0020, CanonicalizeRangeLo }, + { 0x03ac, 0x03ac, 0x0026, CanonicalizeRangeHi }, + { 0x03ad, 0x03af, 
0x0025, CanonicalizeRangeHi }, + { 0x03b0, 0x03b0, 0x0000, CanonicalizeUnique }, + { 0x03b1, 0x03b1, 0x0020, CanonicalizeRangeHi }, + { 0x03b2, 0x03b2, 0x0004, CanonicalizeSet }, + { 0x03b3, 0x03b4, 0x0020, CanonicalizeRangeHi }, + { 0x03b5, 0x03b5, 0x0005, CanonicalizeSet }, + { 0x03b6, 0x03b7, 0x0020, CanonicalizeRangeHi }, + { 0x03b8, 0x03b8, 0x0006, CanonicalizeSet }, + { 0x03b9, 0x03b9, 0x0007, CanonicalizeSet }, + { 0x03ba, 0x03ba, 0x0008, CanonicalizeSet }, + { 0x03bb, 0x03bb, 0x0020, CanonicalizeRangeHi }, + { 0x03bc, 0x03bc, 0x0009, CanonicalizeSet }, + { 0x03bd, 0x03bf, 0x0020, CanonicalizeRangeHi }, + { 0x03c0, 0x03c0, 0x000a, CanonicalizeSet }, + { 0x03c1, 0x03c1, 0x000b, CanonicalizeSet }, + { 0x03c2, 0x03c3, 0x000c, CanonicalizeSet }, + { 0x03c4, 0x03c5, 0x0020, CanonicalizeRangeHi }, + { 0x03c6, 0x03c6, 0x000d, CanonicalizeSet }, + { 0x03c7, 0x03cb, 0x0020, CanonicalizeRangeHi }, + { 0x03cc, 0x03cc, 0x0040, CanonicalizeRangeHi }, + { 0x03cd, 0x03ce, 0x003f, CanonicalizeRangeHi }, + { 0x03cf, 0x03cf, 0x0008, CanonicalizeRangeLo }, + { 0x03d0, 0x03d0, 0x0004, CanonicalizeSet }, + { 0x03d1, 0x03d1, 0x0006, CanonicalizeSet }, + { 0x03d2, 0x03d4, 0x0000, CanonicalizeUnique }, + { 0x03d5, 0x03d5, 0x000d, CanonicalizeSet }, + { 0x03d6, 0x03d6, 0x000a, CanonicalizeSet }, + { 0x03d7, 0x03d7, 0x0008, CanonicalizeRangeHi }, + { 0x03d8, 0x03ef, 0x0000, CanonicalizeAlternatingAligned }, + { 0x03f0, 0x03f0, 0x0008, CanonicalizeSet }, + { 0x03f1, 0x03f1, 0x000b, CanonicalizeSet }, + { 0x03f2, 0x03f2, 0x0007, CanonicalizeRangeLo }, + { 0x03f3, 0x03f3, 0x0074, CanonicalizeRangeHi }, + { 0x03f4, 0x03f4, 0x0000, CanonicalizeUnique }, + { 0x03f5, 0x03f5, 0x0005, CanonicalizeSet }, + { 0x03f6, 0x03f6, 0x0000, CanonicalizeUnique }, + { 0x03f7, 0x03f8, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x03f9, 0x03f9, 0x0007, CanonicalizeRangeHi }, + { 0x03fa, 0x03fb, 0x0000, CanonicalizeAlternatingAligned }, + { 0x03fc, 0x03fc, 0x0000, CanonicalizeUnique }, + { 0x03fd, 
0x03ff, 0x0082, CanonicalizeRangeHi }, + { 0x0400, 0x040f, 0x0050, CanonicalizeRangeLo }, + { 0x0410, 0x042f, 0x0020, CanonicalizeRangeLo }, + { 0x0430, 0x044f, 0x0020, CanonicalizeRangeHi }, + { 0x0450, 0x045f, 0x0050, CanonicalizeRangeHi }, + { 0x0460, 0x0481, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0482, 0x0489, 0x0000, CanonicalizeUnique }, + { 0x048a, 0x04bf, 0x0000, CanonicalizeAlternatingAligned }, + { 0x04c0, 0x04c0, 0x000f, CanonicalizeRangeLo }, + { 0x04c1, 0x04ce, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x04cf, 0x04cf, 0x000f, CanonicalizeRangeHi }, + { 0x04d0, 0x052f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0530, 0x0530, 0x0000, CanonicalizeUnique }, + { 0x0531, 0x0556, 0x0030, CanonicalizeRangeLo }, + { 0x0557, 0x0560, 0x0000, CanonicalizeUnique }, + { 0x0561, 0x0586, 0x0030, CanonicalizeRangeHi }, + { 0x0587, 0x109f, 0x0000, CanonicalizeUnique }, + { 0x10a0, 0x10c5, 0x1c60, CanonicalizeRangeLo }, + { 0x10c6, 0x10c6, 0x0000, CanonicalizeUnique }, + { 0x10c7, 0x10c7, 0x1c60, CanonicalizeRangeLo }, + { 0x10c8, 0x10cc, 0x0000, CanonicalizeUnique }, + { 0x10cd, 0x10cd, 0x1c60, CanonicalizeRangeLo }, + { 0x10ce, 0x1d78, 0x0000, CanonicalizeUnique }, + { 0x1d79, 0x1d79, 0x8a04, CanonicalizeRangeLo }, + { 0x1d7a, 0x1d7c, 0x0000, CanonicalizeUnique }, + { 0x1d7d, 0x1d7d, 0x0ee6, CanonicalizeRangeLo }, + { 0x1d7e, 0x1dff, 0x0000, CanonicalizeUnique }, + { 0x1e00, 0x1e5f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1e60, 0x1e61, 0x000e, CanonicalizeSet }, + { 0x1e62, 0x1e95, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1e96, 0x1e9a, 0x0000, CanonicalizeUnique }, + { 0x1e9b, 0x1e9b, 0x000e, CanonicalizeSet }, + { 0x1e9c, 0x1e9f, 0x0000, CanonicalizeUnique }, + { 0x1ea0, 0x1eff, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1f00, 0x1f07, 0x0008, CanonicalizeRangeLo }, + { 0x1f08, 0x1f0f, 0x0008, CanonicalizeRangeHi }, + { 0x1f10, 0x1f15, 0x0008, CanonicalizeRangeLo }, + { 0x1f16, 0x1f17, 0x0000, CanonicalizeUnique }, + { 0x1f18, 
0x1f1d, 0x0008, CanonicalizeRangeHi }, + { 0x1f1e, 0x1f1f, 0x0000, CanonicalizeUnique }, + { 0x1f20, 0x1f27, 0x0008, CanonicalizeRangeLo }, + { 0x1f28, 0x1f2f, 0x0008, CanonicalizeRangeHi }, + { 0x1f30, 0x1f37, 0x0008, CanonicalizeRangeLo }, + { 0x1f38, 0x1f3f, 0x0008, CanonicalizeRangeHi }, + { 0x1f40, 0x1f45, 0x0008, CanonicalizeRangeLo }, + { 0x1f46, 0x1f47, 0x0000, CanonicalizeUnique }, + { 0x1f48, 0x1f4d, 0x0008, CanonicalizeRangeHi }, + { 0x1f4e, 0x1f50, 0x0000, CanonicalizeUnique }, + { 0x1f51, 0x1f51, 0x0008, CanonicalizeRangeLo }, + { 0x1f52, 0x1f52, 0x0000, CanonicalizeUnique }, + { 0x1f53, 0x1f53, 0x0008, CanonicalizeRangeLo }, + { 0x1f54, 0x1f54, 0x0000, CanonicalizeUnique }, + { 0x1f55, 0x1f55, 0x0008, CanonicalizeRangeLo }, + { 0x1f56, 0x1f56, 0x0000, CanonicalizeUnique }, + { 0x1f57, 0x1f57, 0x0008, CanonicalizeRangeLo }, + { 0x1f58, 0x1f58, 0x0000, CanonicalizeUnique }, + { 0x1f59, 0x1f59, 0x0008, CanonicalizeRangeHi }, + { 0x1f5a, 0x1f5a, 0x0000, CanonicalizeUnique }, + { 0x1f5b, 0x1f5b, 0x0008, CanonicalizeRangeHi }, + { 0x1f5c, 0x1f5c, 0x0000, CanonicalizeUnique }, + { 0x1f5d, 0x1f5d, 0x0008, CanonicalizeRangeHi }, + { 0x1f5e, 0x1f5e, 0x0000, CanonicalizeUnique }, + { 0x1f5f, 0x1f5f, 0x0008, CanonicalizeRangeHi }, + { 0x1f60, 0x1f67, 0x0008, CanonicalizeRangeLo }, + { 0x1f68, 0x1f6f, 0x0008, CanonicalizeRangeHi }, + { 0x1f70, 0x1f71, 0x004a, CanonicalizeRangeLo }, + { 0x1f72, 0x1f75, 0x0056, CanonicalizeRangeLo }, + { 0x1f76, 0x1f77, 0x0064, CanonicalizeRangeLo }, + { 0x1f78, 0x1f79, 0x0080, CanonicalizeRangeLo }, + { 0x1f7a, 0x1f7b, 0x0070, CanonicalizeRangeLo }, + { 0x1f7c, 0x1f7d, 0x007e, CanonicalizeRangeLo }, + { 0x1f7e, 0x1faf, 0x0000, CanonicalizeUnique }, + { 0x1fb0, 0x1fb1, 0x0008, CanonicalizeRangeLo }, + { 0x1fb2, 0x1fb7, 0x0000, CanonicalizeUnique }, + { 0x1fb8, 0x1fb9, 0x0008, CanonicalizeRangeHi }, + { 0x1fba, 0x1fbb, 0x004a, CanonicalizeRangeHi }, + { 0x1fbc, 0x1fbd, 0x0000, CanonicalizeUnique }, + { 0x1fbe, 0x1fbe, 0x0007, 
CanonicalizeSet }, + { 0x1fbf, 0x1fc7, 0x0000, CanonicalizeUnique }, + { 0x1fc8, 0x1fcb, 0x0056, CanonicalizeRangeHi }, + { 0x1fcc, 0x1fcf, 0x0000, CanonicalizeUnique }, + { 0x1fd0, 0x1fd1, 0x0008, CanonicalizeRangeLo }, + { 0x1fd2, 0x1fd7, 0x0000, CanonicalizeUnique }, + { 0x1fd8, 0x1fd9, 0x0008, CanonicalizeRangeHi }, + { 0x1fda, 0x1fdb, 0x0064, CanonicalizeRangeHi }, + { 0x1fdc, 0x1fdf, 0x0000, CanonicalizeUnique }, + { 0x1fe0, 0x1fe1, 0x0008, CanonicalizeRangeLo }, + { 0x1fe2, 0x1fe4, 0x0000, CanonicalizeUnique }, + { 0x1fe5, 0x1fe5, 0x0007, CanonicalizeRangeLo }, + { 0x1fe6, 0x1fe7, 0x0000, CanonicalizeUnique }, + { 0x1fe8, 0x1fe9, 0x0008, CanonicalizeRangeHi }, + { 0x1fea, 0x1feb, 0x0070, CanonicalizeRangeHi }, + { 0x1fec, 0x1fec, 0x0007, CanonicalizeRangeHi }, + { 0x1fed, 0x1ff7, 0x0000, CanonicalizeUnique }, + { 0x1ff8, 0x1ff9, 0x0080, CanonicalizeRangeHi }, + { 0x1ffa, 0x1ffb, 0x007e, CanonicalizeRangeHi }, + { 0x1ffc, 0x2131, 0x0000, CanonicalizeUnique }, + { 0x2132, 0x2132, 0x001c, CanonicalizeRangeLo }, + { 0x2133, 0x214d, 0x0000, CanonicalizeUnique }, + { 0x214e, 0x214e, 0x001c, CanonicalizeRangeHi }, + { 0x214f, 0x215f, 0x0000, CanonicalizeUnique }, + { 0x2160, 0x216f, 0x0010, CanonicalizeRangeLo }, + { 0x2170, 0x217f, 0x0010, CanonicalizeRangeHi }, + { 0x2180, 0x2182, 0x0000, CanonicalizeUnique }, + { 0x2183, 0x2184, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2185, 0x24b5, 0x0000, CanonicalizeUnique }, + { 0x24b6, 0x24cf, 0x001a, CanonicalizeRangeLo }, + { 0x24d0, 0x24e9, 0x001a, CanonicalizeRangeHi }, + { 0x24ea, 0x2bff, 0x0000, CanonicalizeUnique }, + { 0x2c00, 0x2c2e, 0x0030, CanonicalizeRangeLo }, + { 0x2c2f, 0x2c2f, 0x0000, CanonicalizeUnique }, + { 0x2c30, 0x2c5e, 0x0030, CanonicalizeRangeHi }, + { 0x2c5f, 0x2c5f, 0x0000, CanonicalizeUnique }, + { 0x2c60, 0x2c61, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2c62, 0x2c62, 0x29f7, CanonicalizeRangeHi }, + { 0x2c63, 0x2c63, 0x0ee6, CanonicalizeRangeHi }, + { 0x2c64, 0x2c64, 0x29e7, 
CanonicalizeRangeHi }, + { 0x2c65, 0x2c65, 0x2a2b, CanonicalizeRangeHi }, + { 0x2c66, 0x2c66, 0x2a28, CanonicalizeRangeHi }, + { 0x2c67, 0x2c6c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2c6d, 0x2c6d, 0x2a1c, CanonicalizeRangeHi }, + { 0x2c6e, 0x2c6e, 0x29fd, CanonicalizeRangeHi }, + { 0x2c6f, 0x2c6f, 0x2a1f, CanonicalizeRangeHi }, + { 0x2c70, 0x2c70, 0x2a1e, CanonicalizeRangeHi }, + { 0x2c71, 0x2c71, 0x0000, CanonicalizeUnique }, + { 0x2c72, 0x2c73, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2c74, 0x2c74, 0x0000, CanonicalizeUnique }, + { 0x2c75, 0x2c76, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2c77, 0x2c7d, 0x0000, CanonicalizeUnique }, + { 0x2c7e, 0x2c7f, 0x2a3f, CanonicalizeRangeHi }, + { 0x2c80, 0x2ce3, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2ce4, 0x2cea, 0x0000, CanonicalizeUnique }, + { 0x2ceb, 0x2cee, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2cef, 0x2cf1, 0x0000, CanonicalizeUnique }, + { 0x2cf2, 0x2cf3, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2cf4, 0x2cff, 0x0000, CanonicalizeUnique }, + { 0x2d00, 0x2d25, 0x1c60, CanonicalizeRangeHi }, + { 0x2d26, 0x2d26, 0x0000, CanonicalizeUnique }, + { 0x2d27, 0x2d27, 0x1c60, CanonicalizeRangeHi }, + { 0x2d28, 0x2d2c, 0x0000, CanonicalizeUnique }, + { 0x2d2d, 0x2d2d, 0x1c60, CanonicalizeRangeHi }, + { 0x2d2e, 0xa63f, 0x0000, CanonicalizeUnique }, + { 0xa640, 0xa66d, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa66e, 0xa67f, 0x0000, CanonicalizeUnique }, + { 0xa680, 0xa69b, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa69c, 0xa721, 0x0000, CanonicalizeUnique }, + { 0xa722, 0xa72f, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa730, 0xa731, 0x0000, CanonicalizeUnique }, + { 0xa732, 0xa76f, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa770, 0xa778, 0x0000, CanonicalizeUnique }, + { 0xa779, 0xa77c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0xa77d, 0xa77d, 0x8a04, CanonicalizeRangeHi }, + { 0xa77e, 0xa787, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa788, 
0xa78a, 0x0000, CanonicalizeUnique }, + { 0xa78b, 0xa78c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0xa78d, 0xa78d, 0xa528, CanonicalizeRangeHi }, + { 0xa78e, 0xa78f, 0x0000, CanonicalizeUnique }, + { 0xa790, 0xa793, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa794, 0xa795, 0x0000, CanonicalizeUnique }, + { 0xa796, 0xa7a9, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa7aa, 0xa7aa, 0xa544, CanonicalizeRangeHi }, + { 0xa7ab, 0xa7ab, 0xa54f, CanonicalizeRangeHi }, + { 0xa7ac, 0xa7ac, 0xa54b, CanonicalizeRangeHi }, + { 0xa7ad, 0xa7ad, 0xa541, CanonicalizeRangeHi }, + { 0xa7ae, 0xa7af, 0x0000, CanonicalizeUnique }, + { 0xa7b0, 0xa7b0, 0xa512, CanonicalizeRangeHi }, + { 0xa7b1, 0xa7b1, 0xa52a, CanonicalizeRangeHi }, + { 0xa7b2, 0xff20, 0x0000, CanonicalizeUnique }, + { 0xff21, 0xff3a, 0x0020, CanonicalizeRangeLo }, + { 0xff3b, 0xff40, 0x0000, CanonicalizeUnique }, + { 0xff41, 0xff5a, 0x0020, CanonicalizeRangeHi }, + { 0xff5b, 0xffff, 0x0000, CanonicalizeUnique }, }; } } // JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js index 00361dd46e..dc578cfece 100644 --- a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.js @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012 Apple Inc. All rights reserved. + * Copyright (C) 2012, 2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +23,61 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// See ES 5.1, 15.10.2.8 +function printHeader() +{ + var copyright = ( + "/*" + "\n" + + " * Copyright (C) 2012-2013, 2015-2016 Apple Inc. All rights reserved." 
+ "\n" + + " *" + "\n" + + " * Redistribution and use in source and binary forms, with or without" + "\n" + + " * modification, are permitted provided that the following conditions" + "\n" + + " * are met:" + "\n" + + " * 1. Redistributions of source code must retain the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer." + "\n" + + " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" + + " * notice, this list of conditions and the following disclaimer in the" + "\n" + + " * documentation and/or other materials provided with the distribution." + "\n" + + " *" + "\n" + + " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" + + " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" + + " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" + + " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" + + " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" + + " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" + + " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" + + " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" + + " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" + + " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" + + " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" + + " */"); + + print(copyright); + print(); + print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalize.js"); + print(); + print('#include "config.h"'); + print('#include "YarrCanonicalize.h"'); + print(); + print("namespace JSC { namespace Yarr {"); + print(); +} + +function printFooter() +{ + print("} } // JSC::Yarr"); + print(); +} + +// Helper function to convert a number to a fixed width hex representation of a UChar32. 
+function hex(x) +{ + var s = Number(x).toString(16); + while (s.length < 4) + s = 0 + s; + return "0x" + s; +} + +// See ES 6.0, 21.2.2.8.2 Steps 3 function canonicalize(ch) { var u = String.fromCharCode(ch).toUpperCase(); @@ -36,184 +90,104 @@ function canonicalize(ch) } var MAX_UCS2 = 0xFFFF; -var MAX_LATIN = 0xFF; - -var groupedCanonically = []; -// Pass 1: populate groupedCanonically - this is mapping from canonicalized -// values back to the set of character code that canonicalize to them. -for (var i = 0; i <= MAX_UCS2; ++i) { - var ch = canonicalize(i); - if (!groupedCanonically[ch]) - groupedCanonically[ch] = []; - groupedCanonically[ch].push(i); -} -var typeInfo = []; -var latinTypeInfo = []; -var characterSetInfo = []; -// Pass 2: populate typeInfo & characterSetInfo. For every character calculate -// a typeInfo value, described by the types above, and a value payload. -for (cu in groupedCanonically) { - // The set of characters that canonicalize to cu - var characters = groupedCanonically[cu]; - - // If there is only one, it is unique. - if (characters.length == 1) { - typeInfo[characters[0]] = "CanonicalizeUnique:0"; - latinTypeInfo[characters[0]] = characters[0] <= MAX_LATIN ? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0"; - continue; +function createUCS2CanonicalGroups() +{ + var groupedCanonically = []; + // Pass 1: populate groupedCanonically - this is mapping from canonicalized + // values back to the set of character code that canonicalize to them. + for (var i = 0; i <= MAX_UCS2; ++i) { + var ch = canonicalize(i); + if (!groupedCanonically[ch]) + groupedCanonically[ch] = []; + groupedCanonically[ch].push(i); } - // Sort the array. - characters.sort(function(x,y){return x-y;}); + return groupedCanonically; +} - // If there are more than two characters, create an entry in characterSetInfo. 
- if (characters.length > 2) { - for (i in characters) - typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; - characterSetInfo.push(characters); +function createTables(prefix, maxValue, canonicalGroups) +{ + var prefixLower = prefix.toLowerCase(); + var prefixUpper = prefix.toUpperCase(); + var typeInfo = []; + var characterSetInfo = []; + // Pass 2: populate typeInfo & characterSetInfo. For every character calculate + // a typeInfo value, described by the types above, and a value payload. + for (cu in canonicalGroups) { + // The set of characters that canonicalize to cu + var characters = canonicalGroups[cu]; + + // If there is only one, it is unique. + if (characters.length == 1) { + typeInfo[characters[0]] = "CanonicalizeUnique:0"; + continue; + } - if (characters[1] <= MAX_LATIN) - throw new Error("sets with more than one latin character not supported!"); - if (characters[0] <= MAX_LATIN) { - for (i in characters) - latinTypeInfo[characters[i]] = "CanonicalizeLatinOther:" + characters[0]; - latinTypeInfo[characters[0]] = "CanonicalizeLatinSelf:0"; - } else { + // Sort the array. + characters.sort(function(x,y){return x-y;}); + + // If there are more than two characters, create an entry in characterSetInfo. + if (characters.length > 2) { for (i in characters) - latinTypeInfo[characters[i]] = "CanonicalizeLatinInvalid:0"; + typeInfo[characters[i]] = "CanonicalizeSet:" + characterSetInfo.length; + characterSetInfo.push(characters); + + continue; } - continue; + // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner. + var lo = characters[0]; + var hi = characters[1]; + var delta = hi - lo; + if (delta == 1) { + var type = lo & 1 ? 
"CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; + typeInfo[lo] = type; + typeInfo[hi] = type; + } else { + typeInfo[lo] = "CanonicalizeRangeLo:" + delta; + typeInfo[hi] = "CanonicalizeRangeHi:" + delta; + } } - // We have a pair, mark alternating ranges, otherwise track whether this is the low or high partner. - var lo = characters[0]; - var hi = characters[1]; - var delta = hi - lo; - if (delta == 1) { - var type = lo & 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0"; - typeInfo[lo] = type; - typeInfo[hi] = type; - } else { - typeInfo[lo] = "CanonicalizeRangeLo:" + delta; - typeInfo[hi] = "CanonicalizeRangeHi:" + delta; + var rangeInfo = []; + // Pass 3: coallesce types into ranges. + for (var end = 0; end <= maxValue; ++end) { + var begin = end; + var type = typeInfo[end]; + while (end < maxValue && typeInfo[end + 1] == type) + ++end; + rangeInfo.push({begin:begin, end:end, type:type}); } - if (lo > MAX_LATIN) { - latinTypeInfo[lo] = "CanonicalizeLatinInvalid:0"; - latinTypeInfo[hi] = "CanonicalizeLatinInvalid:0"; - } else if (hi > MAX_LATIN) { - latinTypeInfo[lo] = "CanonicalizeLatinSelf:0"; - latinTypeInfo[hi] = "CanonicalizeLatinOther:" + lo; - } else { - if (delta != 0x20 || lo & 0x20) - throw new Error("pairs of latin characters that don't mask with 0x20 not supported!"); - latinTypeInfo[lo] = "CanonicalizeLatinMask0x20:0"; - latinTypeInfo[hi] = "CanonicalizeLatinMask0x20:0"; + for (i in characterSetInfo) { + var characters = "" + var set = characterSetInfo[i]; + for (var j in set) + characters += hex(set[j]) + ", "; + print("const UChar32 " + prefixLower + "CharacterSet" + i + "[] = { " + characters + "0 };"); } + print(); + print("static const size_t " + prefixUpper + "_CANONICALIZATION_SETS = " + characterSetInfo.length + ";"); + print("const UChar32* const " + prefixLower + "CharacterSetInfo[" + prefixUpper + "_CANONICALIZATION_SETS] = {"); + for (i in characterSetInfo) + print(" " + prefixLower + 
"CharacterSet" + i + ","); + print("};"); + print(); + print("const size_t " + prefixUpper + "_CANONICALIZATION_RANGES = " + rangeInfo.length + ";"); + print("const CanonicalizationRange " + prefixLower + "RangeInfo[" + prefixUpper + "_CANONICALIZATION_RANGES] = {"); + for (i in rangeInfo) { + var info = rangeInfo[i]; + var typeAndValue = info.type.split(':'); + print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); + } + print("};"); + print(); } -var rangeInfo = []; -// Pass 3: coallesce types into ranges. -for (var end = 0; end <= MAX_UCS2; ++end) { - var begin = end; - var type = typeInfo[end]; - while (end < MAX_UCS2 && typeInfo[end + 1] == type) - ++end; - rangeInfo.push({begin:begin, end:end, type:type}); -} +printHeader(); -var latinRangeInfo = []; -// Pass 4: coallesce latin-1 types into ranges. -for (var end = 0; end <= MAX_UCS2; ++end) { - var begin = end; - var type = latinTypeInfo[end]; - while (end < MAX_UCS2 && latinTypeInfo[end + 1] == type) - ++end; - latinRangeInfo.push({begin:begin, end:end, type:type}); -} +createTables("UCS2", MAX_UCS2, createUCS2CanonicalGroups()); - -// Helper function to convert a number to a fixed width hex representation of a C uint16_t. -function hex(x) -{ - var s = Number(x).toString(16); - while (s.length < 4) - s = 0 + s; - return "0x" + s + "u"; -} - -var copyright = ( - "/*" + "\n" + - " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" + - " *" + "\n" + - " * Redistribution and use in source and binary forms, with or without" + "\n" + - " * modification, are permitted provided that the following conditions" + "\n" + - " * are met:" + "\n" + - " * 1. Redistributions of source code must retain the above copyright" + "\n" + - " * notice, this list of conditions and the following disclaimer." + "\n" + - " * 2. 
Redistributions in binary form must reproduce the above copyright" + "\n" + - " * notice, this list of conditions and the following disclaimer in the" + "\n" + - " * documentation and/or other materials provided with the distribution." + "\n" + - " *" + "\n" + - " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" + - " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" + - " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" + - " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" + - " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" + - " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" + - " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" + - " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" + - " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" + - " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" + - " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" + - " */"); - -print(copyright); -print(); -print("// DO NOT EDIT! 
- this file autogenerated by YarrCanonicalizeUCS2.js"); -print(); -print('#include "config.h"'); -print('#include "YarrCanonicalizeUCS2.h"'); -print(); -print("namespace JSC { namespace Yarr {"); -print(); -print("#include <stdint.h>"); -print(); - -for (i in characterSetInfo) { - var characters = "" - var set = characterSetInfo[i]; - for (var j in set) - characters += hex(set[j]) + ", "; - print("uint16_t ucs2CharacterSet" + i + "[] = { " + characters + "0 };"); -} -print(); -print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo.length + ";"); -print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {"); -for (i in characterSetInfo) -print(" ucs2CharacterSet" + i + ","); -print("};"); -print(); -print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo.length + ";"); -print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {"); -for (i in rangeInfo) { - var info = rangeInfo[i]; - var typeAndValue = info.type.split(':'); - print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); -} -print("};"); -print(); -print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo.length + ";"); -print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {"); -for (i in latinRangeInfo) { - var info = latinRangeInfo[i]; - var typeAndValue = info.type.split(':'); - print(" { " + hex(info.begin) + ", " + hex(info.end) + ", " + hex(typeAndValue[1]) + ", " + typeAndValue[0] + " },"); -} -print("};"); -print(); -print("} } // JSC::Yarr"); -print(); +printFooter(); diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUnicode.cpp b/src/3rdparty/masm/yarr/YarrCanonicalizeUnicode.cpp new file mode 100644 index 0000000000..37bfc5e060 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUnicode.cpp @@ -0,0 +1,591 @@ +/* +* Copyright (C) 2016 Apple Inc. All rights reserved. 
+* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* +* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY +* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// DO NO EDIT! 
- This file was generated by generateYarrCanonicalizeUnicode + +#include "config.h" +#include "YarrCanonicalize.h" + +namespace JSC { namespace Yarr { + +const UChar32 unicodeCharacterSet0[] = { 0x004b, 0x006b, 0x212a, 0 }; +const UChar32 unicodeCharacterSet1[] = { 0x0053, 0x0073, 0x017f, 0 }; +const UChar32 unicodeCharacterSet2[] = { 0x00c5, 0x00e5, 0x212b, 0 }; +const UChar32 unicodeCharacterSet3[] = { 0x01c4, 0x01c5, 0x01c6, 0 }; +const UChar32 unicodeCharacterSet4[] = { 0x01c7, 0x01c8, 0x01c9, 0 }; +const UChar32 unicodeCharacterSet5[] = { 0x01ca, 0x01cb, 0x01cc, 0 }; +const UChar32 unicodeCharacterSet6[] = { 0x01f1, 0x01f2, 0x01f3, 0 }; +const UChar32 unicodeCharacterSet7[] = { 0x0392, 0x03b2, 0x03d0, 0 }; +const UChar32 unicodeCharacterSet8[] = { 0x0395, 0x03b5, 0x03f5, 0 }; +const UChar32 unicodeCharacterSet9[] = { 0x0398, 0x03b8, 0x03d1, 0x03f4, 0 }; +const UChar32 unicodeCharacterSet10[] = { 0x0345, 0x0399, 0x03b9, 0x1fbe, 0 }; +const UChar32 unicodeCharacterSet11[] = { 0x039a, 0x03ba, 0x03f0, 0 }; +const UChar32 unicodeCharacterSet12[] = { 0x00b5, 0x039c, 0x03bc, 0 }; +const UChar32 unicodeCharacterSet13[] = { 0x03a0, 0x03c0, 0x03d6, 0 }; +const UChar32 unicodeCharacterSet14[] = { 0x03a1, 0x03c1, 0x03f1, 0 }; +const UChar32 unicodeCharacterSet15[] = { 0x03a3, 0x03c2, 0x03c3, 0 }; +const UChar32 unicodeCharacterSet16[] = { 0x03a6, 0x03c6, 0x03d5, 0 }; +const UChar32 unicodeCharacterSet17[] = { 0x03a9, 0x03c9, 0x2126, 0 }; +const UChar32 unicodeCharacterSet18[] = { 0x0412, 0x0432, 0x1c80, 0 }; +const UChar32 unicodeCharacterSet19[] = { 0x0414, 0x0434, 0x1c81, 0 }; +const UChar32 unicodeCharacterSet20[] = { 0x041e, 0x043e, 0x1c82, 0 }; +const UChar32 unicodeCharacterSet21[] = { 0x0421, 0x0441, 0x1c83, 0 }; +const UChar32 unicodeCharacterSet22[] = { 0x0422, 0x0442, 0x1c84, 0x1c85, 0 }; +const UChar32 unicodeCharacterSet23[] = { 0x042a, 0x044a, 0x1c86, 0 }; +const UChar32 unicodeCharacterSet24[] = { 0x0462, 0x0463, 0x1c87, 0 }; +const UChar32 
unicodeCharacterSet25[] = { 0x1e60, 0x1e61, 0x1e9b, 0 }; +const UChar32 unicodeCharacterSet26[] = { 0x1c88, 0xa64a, 0xa64b, 0 }; + +static const size_t UNICODE_CANONICALIZATION_SETS = 27; +const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = { + unicodeCharacterSet0, + unicodeCharacterSet1, + unicodeCharacterSet2, + unicodeCharacterSet3, + unicodeCharacterSet4, + unicodeCharacterSet5, + unicodeCharacterSet6, + unicodeCharacterSet7, + unicodeCharacterSet8, + unicodeCharacterSet9, + unicodeCharacterSet10, + unicodeCharacterSet11, + unicodeCharacterSet12, + unicodeCharacterSet13, + unicodeCharacterSet14, + unicodeCharacterSet15, + unicodeCharacterSet16, + unicodeCharacterSet17, + unicodeCharacterSet18, + unicodeCharacterSet19, + unicodeCharacterSet20, + unicodeCharacterSet21, + unicodeCharacterSet22, + unicodeCharacterSet23, + unicodeCharacterSet24, + unicodeCharacterSet25, + unicodeCharacterSet26, +}; + +const size_t UNICODE_CANONICALIZATION_RANGES = 495; +const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = { + { 0x0000, 0x0040, 0x0000, CanonicalizeUnique }, + { 0x0041, 0x004a, 0x0020, CanonicalizeRangeLo }, + { 0x004b, 0x004b, 0x0000, CanonicalizeSet }, + { 0x004c, 0x0052, 0x0020, CanonicalizeRangeLo }, + { 0x0053, 0x0053, 0x0001, CanonicalizeSet }, + { 0x0054, 0x005a, 0x0020, CanonicalizeRangeLo }, + { 0x005b, 0x0060, 0x0000, CanonicalizeUnique }, + { 0x0061, 0x006a, 0x0020, CanonicalizeRangeHi }, + { 0x006b, 0x006b, 0x0000, CanonicalizeSet }, + { 0x006c, 0x0072, 0x0020, CanonicalizeRangeHi }, + { 0x0073, 0x0073, 0x0001, CanonicalizeSet }, + { 0x0074, 0x007a, 0x0020, CanonicalizeRangeHi }, + { 0x007b, 0x00b4, 0x0000, CanonicalizeUnique }, + { 0x00b5, 0x00b5, 0x000c, CanonicalizeSet }, + { 0x00b6, 0x00bf, 0x0000, CanonicalizeUnique }, + { 0x00c0, 0x00c4, 0x0020, CanonicalizeRangeLo }, + { 0x00c5, 0x00c5, 0x0002, CanonicalizeSet }, + { 0x00c6, 0x00d6, 0x0020, CanonicalizeRangeLo }, + { 0x00d7, 0x00d7, 0x0000, 
CanonicalizeUnique }, + { 0x00d8, 0x00de, 0x0020, CanonicalizeRangeLo }, + { 0x00df, 0x00df, 0x1dbf, CanonicalizeRangeLo }, + { 0x00e0, 0x00e4, 0x0020, CanonicalizeRangeHi }, + { 0x00e5, 0x00e5, 0x0002, CanonicalizeSet }, + { 0x00e6, 0x00f6, 0x0020, CanonicalizeRangeHi }, + { 0x00f7, 0x00f7, 0x0000, CanonicalizeUnique }, + { 0x00f8, 0x00fe, 0x0020, CanonicalizeRangeHi }, + { 0x00ff, 0x00ff, 0x0079, CanonicalizeRangeLo }, + { 0x0100, 0x012f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0130, 0x0131, 0x0000, CanonicalizeUnique }, + { 0x0132, 0x0137, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0138, 0x0138, 0x0000, CanonicalizeUnique }, + { 0x0139, 0x0148, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0149, 0x0149, 0x0000, CanonicalizeUnique }, + { 0x014a, 0x0177, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0178, 0x0178, 0x0079, CanonicalizeRangeHi }, + { 0x0179, 0x017e, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x017f, 0x017f, 0x0001, CanonicalizeSet }, + { 0x0180, 0x0180, 0x00c3, CanonicalizeRangeLo }, + { 0x0181, 0x0181, 0x00d2, CanonicalizeRangeLo }, + { 0x0182, 0x0185, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0186, 0x0186, 0x00ce, CanonicalizeRangeLo }, + { 0x0187, 0x0188, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0189, 0x018a, 0x00cd, CanonicalizeRangeLo }, + { 0x018b, 0x018c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x018d, 0x018d, 0x0000, CanonicalizeUnique }, + { 0x018e, 0x018e, 0x004f, CanonicalizeRangeLo }, + { 0x018f, 0x018f, 0x00ca, CanonicalizeRangeLo }, + { 0x0190, 0x0190, 0x00cb, CanonicalizeRangeLo }, + { 0x0191, 0x0192, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0193, 0x0193, 0x00cd, CanonicalizeRangeLo }, + { 0x0194, 0x0194, 0x00cf, CanonicalizeRangeLo }, + { 0x0195, 0x0195, 0x0061, CanonicalizeRangeLo }, + { 0x0196, 0x0196, 0x00d3, CanonicalizeRangeLo }, + { 0x0197, 0x0197, 0x00d1, CanonicalizeRangeLo }, + { 0x0198, 0x0199, 0x0000, CanonicalizeAlternatingAligned }, + { 0x019a, 0x019a, 0x00a3, 
CanonicalizeRangeLo }, + { 0x019b, 0x019b, 0x0000, CanonicalizeUnique }, + { 0x019c, 0x019c, 0x00d3, CanonicalizeRangeLo }, + { 0x019d, 0x019d, 0x00d5, CanonicalizeRangeLo }, + { 0x019e, 0x019e, 0x0082, CanonicalizeRangeLo }, + { 0x019f, 0x019f, 0x00d6, CanonicalizeRangeLo }, + { 0x01a0, 0x01a5, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01a6, 0x01a6, 0x00da, CanonicalizeRangeLo }, + { 0x01a7, 0x01a8, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01a9, 0x01a9, 0x00da, CanonicalizeRangeLo }, + { 0x01aa, 0x01ab, 0x0000, CanonicalizeUnique }, + { 0x01ac, 0x01ad, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01ae, 0x01ae, 0x00da, CanonicalizeRangeLo }, + { 0x01af, 0x01b0, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01b1, 0x01b2, 0x00d9, CanonicalizeRangeLo }, + { 0x01b3, 0x01b6, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01b7, 0x01b7, 0x00db, CanonicalizeRangeLo }, + { 0x01b8, 0x01b9, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01ba, 0x01bb, 0x0000, CanonicalizeUnique }, + { 0x01bc, 0x01bd, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01be, 0x01be, 0x0000, CanonicalizeUnique }, + { 0x01bf, 0x01bf, 0x0038, CanonicalizeRangeLo }, + { 0x01c0, 0x01c3, 0x0000, CanonicalizeUnique }, + { 0x01c4, 0x01c6, 0x0003, CanonicalizeSet }, + { 0x01c7, 0x01c9, 0x0004, CanonicalizeSet }, + { 0x01ca, 0x01cc, 0x0005, CanonicalizeSet }, + { 0x01cd, 0x01dc, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x01dd, 0x01dd, 0x004f, CanonicalizeRangeHi }, + { 0x01de, 0x01ef, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01f0, 0x01f0, 0x0000, CanonicalizeUnique }, + { 0x01f1, 0x01f3, 0x0006, CanonicalizeSet }, + { 0x01f4, 0x01f5, 0x0000, CanonicalizeAlternatingAligned }, + { 0x01f6, 0x01f6, 0x0061, CanonicalizeRangeHi }, + { 0x01f7, 0x01f7, 0x0038, CanonicalizeRangeHi }, + { 0x01f8, 0x021f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0220, 0x0220, 0x0082, CanonicalizeRangeHi }, + { 0x0221, 0x0221, 0x0000, CanonicalizeUnique }, + { 0x0222, 0x0233, 0x0000, 
CanonicalizeAlternatingAligned }, + { 0x0234, 0x0239, 0x0000, CanonicalizeUnique }, + { 0x023a, 0x023a, 0x2a2b, CanonicalizeRangeLo }, + { 0x023b, 0x023c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x023d, 0x023d, 0x00a3, CanonicalizeRangeHi }, + { 0x023e, 0x023e, 0x2a28, CanonicalizeRangeLo }, + { 0x023f, 0x0240, 0x2a3f, CanonicalizeRangeLo }, + { 0x0241, 0x0242, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x0243, 0x0243, 0x00c3, CanonicalizeRangeHi }, + { 0x0244, 0x0244, 0x0045, CanonicalizeRangeLo }, + { 0x0245, 0x0245, 0x0047, CanonicalizeRangeLo }, + { 0x0246, 0x024f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0250, 0x0250, 0x2a1f, CanonicalizeRangeLo }, + { 0x0251, 0x0251, 0x2a1c, CanonicalizeRangeLo }, + { 0x0252, 0x0252, 0x2a1e, CanonicalizeRangeLo }, + { 0x0253, 0x0253, 0x00d2, CanonicalizeRangeHi }, + { 0x0254, 0x0254, 0x00ce, CanonicalizeRangeHi }, + { 0x0255, 0x0255, 0x0000, CanonicalizeUnique }, + { 0x0256, 0x0257, 0x00cd, CanonicalizeRangeHi }, + { 0x0258, 0x0258, 0x0000, CanonicalizeUnique }, + { 0x0259, 0x0259, 0x00ca, CanonicalizeRangeHi }, + { 0x025a, 0x025a, 0x0000, CanonicalizeUnique }, + { 0x025b, 0x025b, 0x00cb, CanonicalizeRangeHi }, + { 0x025c, 0x025c, 0xa54f, CanonicalizeRangeLo }, + { 0x025d, 0x025f, 0x0000, CanonicalizeUnique }, + { 0x0260, 0x0260, 0x00cd, CanonicalizeRangeHi }, + { 0x0261, 0x0261, 0xa54b, CanonicalizeRangeLo }, + { 0x0262, 0x0262, 0x0000, CanonicalizeUnique }, + { 0x0263, 0x0263, 0x00cf, CanonicalizeRangeHi }, + { 0x0264, 0x0264, 0x0000, CanonicalizeUnique }, + { 0x0265, 0x0265, 0xa528, CanonicalizeRangeLo }, + { 0x0266, 0x0266, 0xa544, CanonicalizeRangeLo }, + { 0x0267, 0x0267, 0x0000, CanonicalizeUnique }, + { 0x0268, 0x0268, 0x00d1, CanonicalizeRangeHi }, + { 0x0269, 0x0269, 0x00d3, CanonicalizeRangeHi }, + { 0x026a, 0x026a, 0xa544, CanonicalizeRangeLo }, + { 0x026b, 0x026b, 0x29f7, CanonicalizeRangeLo }, + { 0x026c, 0x026c, 0xa541, CanonicalizeRangeLo }, + { 0x026d, 0x026e, 0x0000, CanonicalizeUnique }, 
+ { 0x026f, 0x026f, 0x00d3, CanonicalizeRangeHi }, + { 0x0270, 0x0270, 0x0000, CanonicalizeUnique }, + { 0x0271, 0x0271, 0x29fd, CanonicalizeRangeLo }, + { 0x0272, 0x0272, 0x00d5, CanonicalizeRangeHi }, + { 0x0273, 0x0274, 0x0000, CanonicalizeUnique }, + { 0x0275, 0x0275, 0x00d6, CanonicalizeRangeHi }, + { 0x0276, 0x027c, 0x0000, CanonicalizeUnique }, + { 0x027d, 0x027d, 0x29e7, CanonicalizeRangeLo }, + { 0x027e, 0x027f, 0x0000, CanonicalizeUnique }, + { 0x0280, 0x0280, 0x00da, CanonicalizeRangeHi }, + { 0x0281, 0x0282, 0x0000, CanonicalizeUnique }, + { 0x0283, 0x0283, 0x00da, CanonicalizeRangeHi }, + { 0x0284, 0x0286, 0x0000, CanonicalizeUnique }, + { 0x0287, 0x0287, 0xa52a, CanonicalizeRangeLo }, + { 0x0288, 0x0288, 0x00da, CanonicalizeRangeHi }, + { 0x0289, 0x0289, 0x0045, CanonicalizeRangeHi }, + { 0x028a, 0x028b, 0x00d9, CanonicalizeRangeHi }, + { 0x028c, 0x028c, 0x0047, CanonicalizeRangeHi }, + { 0x028d, 0x0291, 0x0000, CanonicalizeUnique }, + { 0x0292, 0x0292, 0x00db, CanonicalizeRangeHi }, + { 0x0293, 0x029c, 0x0000, CanonicalizeUnique }, + { 0x029d, 0x029d, 0xa515, CanonicalizeRangeLo }, + { 0x029e, 0x029e, 0xa512, CanonicalizeRangeLo }, + { 0x029f, 0x0344, 0x0000, CanonicalizeUnique }, + { 0x0345, 0x0345, 0x000a, CanonicalizeSet }, + { 0x0346, 0x036f, 0x0000, CanonicalizeUnique }, + { 0x0370, 0x0373, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0374, 0x0375, 0x0000, CanonicalizeUnique }, + { 0x0376, 0x0377, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0378, 0x037a, 0x0000, CanonicalizeUnique }, + { 0x037b, 0x037d, 0x0082, CanonicalizeRangeLo }, + { 0x037e, 0x037e, 0x0000, CanonicalizeUnique }, + { 0x037f, 0x037f, 0x0074, CanonicalizeRangeLo }, + { 0x0380, 0x0385, 0x0000, CanonicalizeUnique }, + { 0x0386, 0x0386, 0x0026, CanonicalizeRangeLo }, + { 0x0387, 0x0387, 0x0000, CanonicalizeUnique }, + { 0x0388, 0x038a, 0x0025, CanonicalizeRangeLo }, + { 0x038b, 0x038b, 0x0000, CanonicalizeUnique }, + { 0x038c, 0x038c, 0x0040, CanonicalizeRangeLo }, + { 
0x038d, 0x038d, 0x0000, CanonicalizeUnique }, + { 0x038e, 0x038f, 0x003f, CanonicalizeRangeLo }, + { 0x0390, 0x0390, 0x0000, CanonicalizeUnique }, + { 0x0391, 0x0391, 0x0020, CanonicalizeRangeLo }, + { 0x0392, 0x0392, 0x0007, CanonicalizeSet }, + { 0x0393, 0x0394, 0x0020, CanonicalizeRangeLo }, + { 0x0395, 0x0395, 0x0008, CanonicalizeSet }, + { 0x0396, 0x0397, 0x0020, CanonicalizeRangeLo }, + { 0x0398, 0x0398, 0x0009, CanonicalizeSet }, + { 0x0399, 0x0399, 0x000a, CanonicalizeSet }, + { 0x039a, 0x039a, 0x000b, CanonicalizeSet }, + { 0x039b, 0x039b, 0x0020, CanonicalizeRangeLo }, + { 0x039c, 0x039c, 0x000c, CanonicalizeSet }, + { 0x039d, 0x039f, 0x0020, CanonicalizeRangeLo }, + { 0x03a0, 0x03a0, 0x000d, CanonicalizeSet }, + { 0x03a1, 0x03a1, 0x000e, CanonicalizeSet }, + { 0x03a2, 0x03a2, 0x0000, CanonicalizeUnique }, + { 0x03a3, 0x03a3, 0x000f, CanonicalizeSet }, + { 0x03a4, 0x03a5, 0x0020, CanonicalizeRangeLo }, + { 0x03a6, 0x03a6, 0x0010, CanonicalizeSet }, + { 0x03a7, 0x03a8, 0x0020, CanonicalizeRangeLo }, + { 0x03a9, 0x03a9, 0x0011, CanonicalizeSet }, + { 0x03aa, 0x03ab, 0x0020, CanonicalizeRangeLo }, + { 0x03ac, 0x03ac, 0x0026, CanonicalizeRangeHi }, + { 0x03ad, 0x03af, 0x0025, CanonicalizeRangeHi }, + { 0x03b0, 0x03b0, 0x0000, CanonicalizeUnique }, + { 0x03b1, 0x03b1, 0x0020, CanonicalizeRangeHi }, + { 0x03b2, 0x03b2, 0x0007, CanonicalizeSet }, + { 0x03b3, 0x03b4, 0x0020, CanonicalizeRangeHi }, + { 0x03b5, 0x03b5, 0x0008, CanonicalizeSet }, + { 0x03b6, 0x03b7, 0x0020, CanonicalizeRangeHi }, + { 0x03b8, 0x03b8, 0x0009, CanonicalizeSet }, + { 0x03b9, 0x03b9, 0x000a, CanonicalizeSet }, + { 0x03ba, 0x03ba, 0x000b, CanonicalizeSet }, + { 0x03bb, 0x03bb, 0x0020, CanonicalizeRangeHi }, + { 0x03bc, 0x03bc, 0x000c, CanonicalizeSet }, + { 0x03bd, 0x03bf, 0x0020, CanonicalizeRangeHi }, + { 0x03c0, 0x03c0, 0x000d, CanonicalizeSet }, + { 0x03c1, 0x03c1, 0x000e, CanonicalizeSet }, + { 0x03c2, 0x03c3, 0x000f, CanonicalizeSet }, + { 0x03c4, 0x03c5, 0x0020, CanonicalizeRangeHi 
}, + { 0x03c6, 0x03c6, 0x0010, CanonicalizeSet }, + { 0x03c7, 0x03c8, 0x0020, CanonicalizeRangeHi }, + { 0x03c9, 0x03c9, 0x0011, CanonicalizeSet }, + { 0x03ca, 0x03cb, 0x0020, CanonicalizeRangeHi }, + { 0x03cc, 0x03cc, 0x0040, CanonicalizeRangeHi }, + { 0x03cd, 0x03ce, 0x003f, CanonicalizeRangeHi }, + { 0x03cf, 0x03cf, 0x0008, CanonicalizeRangeLo }, + { 0x03d0, 0x03d0, 0x0007, CanonicalizeSet }, + { 0x03d1, 0x03d1, 0x0009, CanonicalizeSet }, + { 0x03d2, 0x03d4, 0x0000, CanonicalizeUnique }, + { 0x03d5, 0x03d5, 0x0010, CanonicalizeSet }, + { 0x03d6, 0x03d6, 0x000d, CanonicalizeSet }, + { 0x03d7, 0x03d7, 0x0008, CanonicalizeRangeHi }, + { 0x03d8, 0x03ef, 0x0000, CanonicalizeAlternatingAligned }, + { 0x03f0, 0x03f0, 0x000b, CanonicalizeSet }, + { 0x03f1, 0x03f1, 0x000e, CanonicalizeSet }, + { 0x03f2, 0x03f2, 0x0007, CanonicalizeRangeLo }, + { 0x03f3, 0x03f3, 0x0074, CanonicalizeRangeHi }, + { 0x03f4, 0x03f4, 0x0009, CanonicalizeSet }, + { 0x03f5, 0x03f5, 0x0008, CanonicalizeSet }, + { 0x03f6, 0x03f6, 0x0000, CanonicalizeUnique }, + { 0x03f7, 0x03f8, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x03f9, 0x03f9, 0x0007, CanonicalizeRangeHi }, + { 0x03fa, 0x03fb, 0x0000, CanonicalizeAlternatingAligned }, + { 0x03fc, 0x03fc, 0x0000, CanonicalizeUnique }, + { 0x03fd, 0x03ff, 0x0082, CanonicalizeRangeHi }, + { 0x0400, 0x040f, 0x0050, CanonicalizeRangeLo }, + { 0x0410, 0x0411, 0x0020, CanonicalizeRangeLo }, + { 0x0412, 0x0412, 0x0012, CanonicalizeSet }, + { 0x0413, 0x0413, 0x0020, CanonicalizeRangeLo }, + { 0x0414, 0x0414, 0x0013, CanonicalizeSet }, + { 0x0415, 0x041d, 0x0020, CanonicalizeRangeLo }, + { 0x041e, 0x041e, 0x0014, CanonicalizeSet }, + { 0x041f, 0x0420, 0x0020, CanonicalizeRangeLo }, + { 0x0421, 0x0421, 0x0015, CanonicalizeSet }, + { 0x0422, 0x0422, 0x0016, CanonicalizeSet }, + { 0x0423, 0x0429, 0x0020, CanonicalizeRangeLo }, + { 0x042a, 0x042a, 0x0017, CanonicalizeSet }, + { 0x042b, 0x042f, 0x0020, CanonicalizeRangeLo }, + { 0x0430, 0x0431, 0x0020, 
CanonicalizeRangeHi }, + { 0x0432, 0x0432, 0x0012, CanonicalizeSet }, + { 0x0433, 0x0433, 0x0020, CanonicalizeRangeHi }, + { 0x0434, 0x0434, 0x0013, CanonicalizeSet }, + { 0x0435, 0x043d, 0x0020, CanonicalizeRangeHi }, + { 0x043e, 0x043e, 0x0014, CanonicalizeSet }, + { 0x043f, 0x0440, 0x0020, CanonicalizeRangeHi }, + { 0x0441, 0x0441, 0x0015, CanonicalizeSet }, + { 0x0442, 0x0442, 0x0016, CanonicalizeSet }, + { 0x0443, 0x0449, 0x0020, CanonicalizeRangeHi }, + { 0x044a, 0x044a, 0x0017, CanonicalizeSet }, + { 0x044b, 0x044f, 0x0020, CanonicalizeRangeHi }, + { 0x0450, 0x045f, 0x0050, CanonicalizeRangeHi }, + { 0x0460, 0x0461, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0462, 0x0463, 0x0018, CanonicalizeSet }, + { 0x0464, 0x0481, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0482, 0x0489, 0x0000, CanonicalizeUnique }, + { 0x048a, 0x04bf, 0x0000, CanonicalizeAlternatingAligned }, + { 0x04c0, 0x04c0, 0x000f, CanonicalizeRangeLo }, + { 0x04c1, 0x04ce, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x04cf, 0x04cf, 0x000f, CanonicalizeRangeHi }, + { 0x04d0, 0x052f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x0530, 0x0530, 0x0000, CanonicalizeUnique }, + { 0x0531, 0x0556, 0x0030, CanonicalizeRangeLo }, + { 0x0557, 0x0560, 0x0000, CanonicalizeUnique }, + { 0x0561, 0x0586, 0x0030, CanonicalizeRangeHi }, + { 0x0587, 0x109f, 0x0000, CanonicalizeUnique }, + { 0x10a0, 0x10c5, 0x1c60, CanonicalizeRangeLo }, + { 0x10c6, 0x10c6, 0x0000, CanonicalizeUnique }, + { 0x10c7, 0x10c7, 0x1c60, CanonicalizeRangeLo }, + { 0x10c8, 0x10cc, 0x0000, CanonicalizeUnique }, + { 0x10cd, 0x10cd, 0x1c60, CanonicalizeRangeLo }, + { 0x10ce, 0x139f, 0x0000, CanonicalizeUnique }, + { 0x13a0, 0x13ef, 0x97d0, CanonicalizeRangeLo }, + { 0x13f0, 0x13f5, 0x0008, CanonicalizeRangeLo }, + { 0x13f6, 0x13f7, 0x0000, CanonicalizeUnique }, + { 0x13f8, 0x13fd, 0x0008, CanonicalizeRangeHi }, + { 0x13fe, 0x1c7f, 0x0000, CanonicalizeUnique }, + { 0x1c80, 0x1c80, 0x0012, CanonicalizeSet }, + { 0x1c81, 0x1c81, 
0x0013, CanonicalizeSet }, + { 0x1c82, 0x1c82, 0x0014, CanonicalizeSet }, + { 0x1c83, 0x1c83, 0x0015, CanonicalizeSet }, + { 0x1c84, 0x1c85, 0x0016, CanonicalizeSet }, + { 0x1c86, 0x1c86, 0x0017, CanonicalizeSet }, + { 0x1c87, 0x1c87, 0x0018, CanonicalizeSet }, + { 0x1c88, 0x1c88, 0x001a, CanonicalizeSet }, + { 0x1c89, 0x1d78, 0x0000, CanonicalizeUnique }, + { 0x1d79, 0x1d79, 0x8a04, CanonicalizeRangeLo }, + { 0x1d7a, 0x1d7c, 0x0000, CanonicalizeUnique }, + { 0x1d7d, 0x1d7d, 0x0ee6, CanonicalizeRangeLo }, + { 0x1d7e, 0x1dff, 0x0000, CanonicalizeUnique }, + { 0x1e00, 0x1e5f, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1e60, 0x1e61, 0x0019, CanonicalizeSet }, + { 0x1e62, 0x1e95, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1e96, 0x1e9a, 0x0000, CanonicalizeUnique }, + { 0x1e9b, 0x1e9b, 0x0019, CanonicalizeSet }, + { 0x1e9c, 0x1e9d, 0x0000, CanonicalizeUnique }, + { 0x1e9e, 0x1e9e, 0x1dbf, CanonicalizeRangeHi }, + { 0x1e9f, 0x1e9f, 0x0000, CanonicalizeUnique }, + { 0x1ea0, 0x1eff, 0x0000, CanonicalizeAlternatingAligned }, + { 0x1f00, 0x1f07, 0x0008, CanonicalizeRangeLo }, + { 0x1f08, 0x1f0f, 0x0008, CanonicalizeRangeHi }, + { 0x1f10, 0x1f15, 0x0008, CanonicalizeRangeLo }, + { 0x1f16, 0x1f17, 0x0000, CanonicalizeUnique }, + { 0x1f18, 0x1f1d, 0x0008, CanonicalizeRangeHi }, + { 0x1f1e, 0x1f1f, 0x0000, CanonicalizeUnique }, + { 0x1f20, 0x1f27, 0x0008, CanonicalizeRangeLo }, + { 0x1f28, 0x1f2f, 0x0008, CanonicalizeRangeHi }, + { 0x1f30, 0x1f37, 0x0008, CanonicalizeRangeLo }, + { 0x1f38, 0x1f3f, 0x0008, CanonicalizeRangeHi }, + { 0x1f40, 0x1f45, 0x0008, CanonicalizeRangeLo }, + { 0x1f46, 0x1f47, 0x0000, CanonicalizeUnique }, + { 0x1f48, 0x1f4d, 0x0008, CanonicalizeRangeHi }, + { 0x1f4e, 0x1f50, 0x0000, CanonicalizeUnique }, + { 0x1f51, 0x1f51, 0x0008, CanonicalizeRangeLo }, + { 0x1f52, 0x1f52, 0x0000, CanonicalizeUnique }, + { 0x1f53, 0x1f53, 0x0008, CanonicalizeRangeLo }, + { 0x1f54, 0x1f54, 0x0000, CanonicalizeUnique }, + { 0x1f55, 0x1f55, 0x0008, 
CanonicalizeRangeLo }, + { 0x1f56, 0x1f56, 0x0000, CanonicalizeUnique }, + { 0x1f57, 0x1f57, 0x0008, CanonicalizeRangeLo }, + { 0x1f58, 0x1f58, 0x0000, CanonicalizeUnique }, + { 0x1f59, 0x1f59, 0x0008, CanonicalizeRangeHi }, + { 0x1f5a, 0x1f5a, 0x0000, CanonicalizeUnique }, + { 0x1f5b, 0x1f5b, 0x0008, CanonicalizeRangeHi }, + { 0x1f5c, 0x1f5c, 0x0000, CanonicalizeUnique }, + { 0x1f5d, 0x1f5d, 0x0008, CanonicalizeRangeHi }, + { 0x1f5e, 0x1f5e, 0x0000, CanonicalizeUnique }, + { 0x1f5f, 0x1f5f, 0x0008, CanonicalizeRangeHi }, + { 0x1f60, 0x1f67, 0x0008, CanonicalizeRangeLo }, + { 0x1f68, 0x1f6f, 0x0008, CanonicalizeRangeHi }, + { 0x1f70, 0x1f71, 0x004a, CanonicalizeRangeLo }, + { 0x1f72, 0x1f75, 0x0056, CanonicalizeRangeLo }, + { 0x1f76, 0x1f77, 0x0064, CanonicalizeRangeLo }, + { 0x1f78, 0x1f79, 0x0080, CanonicalizeRangeLo }, + { 0x1f7a, 0x1f7b, 0x0070, CanonicalizeRangeLo }, + { 0x1f7c, 0x1f7d, 0x007e, CanonicalizeRangeLo }, + { 0x1f7e, 0x1f7f, 0x0000, CanonicalizeUnique }, + { 0x1f80, 0x1f87, 0x0008, CanonicalizeRangeLo }, + { 0x1f88, 0x1f8f, 0x0008, CanonicalizeRangeHi }, + { 0x1f90, 0x1f97, 0x0008, CanonicalizeRangeLo }, + { 0x1f98, 0x1f9f, 0x0008, CanonicalizeRangeHi }, + { 0x1fa0, 0x1fa7, 0x0008, CanonicalizeRangeLo }, + { 0x1fa8, 0x1faf, 0x0008, CanonicalizeRangeHi }, + { 0x1fb0, 0x1fb1, 0x0008, CanonicalizeRangeLo }, + { 0x1fb2, 0x1fb2, 0x0000, CanonicalizeUnique }, + { 0x1fb3, 0x1fb3, 0x0009, CanonicalizeRangeLo }, + { 0x1fb4, 0x1fb7, 0x0000, CanonicalizeUnique }, + { 0x1fb8, 0x1fb9, 0x0008, CanonicalizeRangeHi }, + { 0x1fba, 0x1fbb, 0x004a, CanonicalizeRangeHi }, + { 0x1fbc, 0x1fbc, 0x0009, CanonicalizeRangeHi }, + { 0x1fbd, 0x1fbd, 0x0000, CanonicalizeUnique }, + { 0x1fbe, 0x1fbe, 0x000a, CanonicalizeSet }, + { 0x1fbf, 0x1fc2, 0x0000, CanonicalizeUnique }, + { 0x1fc3, 0x1fc3, 0x0009, CanonicalizeRangeLo }, + { 0x1fc4, 0x1fc7, 0x0000, CanonicalizeUnique }, + { 0x1fc8, 0x1fcb, 0x0056, CanonicalizeRangeHi }, + { 0x1fcc, 0x1fcc, 0x0009, CanonicalizeRangeHi }, + 
{ 0x1fcd, 0x1fcf, 0x0000, CanonicalizeUnique }, + { 0x1fd0, 0x1fd1, 0x0008, CanonicalizeRangeLo }, + { 0x1fd2, 0x1fd7, 0x0000, CanonicalizeUnique }, + { 0x1fd8, 0x1fd9, 0x0008, CanonicalizeRangeHi }, + { 0x1fda, 0x1fdb, 0x0064, CanonicalizeRangeHi }, + { 0x1fdc, 0x1fdf, 0x0000, CanonicalizeUnique }, + { 0x1fe0, 0x1fe1, 0x0008, CanonicalizeRangeLo }, + { 0x1fe2, 0x1fe4, 0x0000, CanonicalizeUnique }, + { 0x1fe5, 0x1fe5, 0x0007, CanonicalizeRangeLo }, + { 0x1fe6, 0x1fe7, 0x0000, CanonicalizeUnique }, + { 0x1fe8, 0x1fe9, 0x0008, CanonicalizeRangeHi }, + { 0x1fea, 0x1feb, 0x0070, CanonicalizeRangeHi }, + { 0x1fec, 0x1fec, 0x0007, CanonicalizeRangeHi }, + { 0x1fed, 0x1ff2, 0x0000, CanonicalizeUnique }, + { 0x1ff3, 0x1ff3, 0x0009, CanonicalizeRangeLo }, + { 0x1ff4, 0x1ff7, 0x0000, CanonicalizeUnique }, + { 0x1ff8, 0x1ff9, 0x0080, CanonicalizeRangeHi }, + { 0x1ffa, 0x1ffb, 0x007e, CanonicalizeRangeHi }, + { 0x1ffc, 0x1ffc, 0x0009, CanonicalizeRangeHi }, + { 0x1ffd, 0x2125, 0x0000, CanonicalizeUnique }, + { 0x2126, 0x2126, 0x0011, CanonicalizeSet }, + { 0x2127, 0x2129, 0x0000, CanonicalizeUnique }, + { 0x212a, 0x212a, 0x0000, CanonicalizeSet }, + { 0x212b, 0x212b, 0x0002, CanonicalizeSet }, + { 0x212c, 0x2131, 0x0000, CanonicalizeUnique }, + { 0x2132, 0x2132, 0x001c, CanonicalizeRangeLo }, + { 0x2133, 0x214d, 0x0000, CanonicalizeUnique }, + { 0x214e, 0x214e, 0x001c, CanonicalizeRangeHi }, + { 0x214f, 0x215f, 0x0000, CanonicalizeUnique }, + { 0x2160, 0x216f, 0x0010, CanonicalizeRangeLo }, + { 0x2170, 0x217f, 0x0010, CanonicalizeRangeHi }, + { 0x2180, 0x2182, 0x0000, CanonicalizeUnique }, + { 0x2183, 0x2184, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2185, 0x24b5, 0x0000, CanonicalizeUnique }, + { 0x24b6, 0x24cf, 0x001a, CanonicalizeRangeLo }, + { 0x24d0, 0x24e9, 0x001a, CanonicalizeRangeHi }, + { 0x24ea, 0x2bff, 0x0000, CanonicalizeUnique }, + { 0x2c00, 0x2c2e, 0x0030, CanonicalizeRangeLo }, + { 0x2c2f, 0x2c2f, 0x0000, CanonicalizeUnique }, + { 0x2c30, 0x2c5e, 0x0030, 
CanonicalizeRangeHi }, + { 0x2c5f, 0x2c5f, 0x0000, CanonicalizeUnique }, + { 0x2c60, 0x2c61, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2c62, 0x2c62, 0x29f7, CanonicalizeRangeHi }, + { 0x2c63, 0x2c63, 0x0ee6, CanonicalizeRangeHi }, + { 0x2c64, 0x2c64, 0x29e7, CanonicalizeRangeHi }, + { 0x2c65, 0x2c65, 0x2a2b, CanonicalizeRangeHi }, + { 0x2c66, 0x2c66, 0x2a28, CanonicalizeRangeHi }, + { 0x2c67, 0x2c6c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2c6d, 0x2c6d, 0x2a1c, CanonicalizeRangeHi }, + { 0x2c6e, 0x2c6e, 0x29fd, CanonicalizeRangeHi }, + { 0x2c6f, 0x2c6f, 0x2a1f, CanonicalizeRangeHi }, + { 0x2c70, 0x2c70, 0x2a1e, CanonicalizeRangeHi }, + { 0x2c71, 0x2c71, 0x0000, CanonicalizeUnique }, + { 0x2c72, 0x2c73, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2c74, 0x2c74, 0x0000, CanonicalizeUnique }, + { 0x2c75, 0x2c76, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2c77, 0x2c7d, 0x0000, CanonicalizeUnique }, + { 0x2c7e, 0x2c7f, 0x2a3f, CanonicalizeRangeHi }, + { 0x2c80, 0x2ce3, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2ce4, 0x2cea, 0x0000, CanonicalizeUnique }, + { 0x2ceb, 0x2cee, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0x2cef, 0x2cf1, 0x0000, CanonicalizeUnique }, + { 0x2cf2, 0x2cf3, 0x0000, CanonicalizeAlternatingAligned }, + { 0x2cf4, 0x2cff, 0x0000, CanonicalizeUnique }, + { 0x2d00, 0x2d25, 0x1c60, CanonicalizeRangeHi }, + { 0x2d26, 0x2d26, 0x0000, CanonicalizeUnique }, + { 0x2d27, 0x2d27, 0x1c60, CanonicalizeRangeHi }, + { 0x2d28, 0x2d2c, 0x0000, CanonicalizeUnique }, + { 0x2d2d, 0x2d2d, 0x1c60, CanonicalizeRangeHi }, + { 0x2d2e, 0xa63f, 0x0000, CanonicalizeUnique }, + { 0xa640, 0xa649, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa64a, 0xa64b, 0x001a, CanonicalizeSet }, + { 0xa64c, 0xa66d, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa66e, 0xa67f, 0x0000, CanonicalizeUnique }, + { 0xa680, 0xa69b, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa69c, 0xa721, 0x0000, CanonicalizeUnique }, + { 0xa722, 0xa72f, 0x0000, 
CanonicalizeAlternatingAligned }, + { 0xa730, 0xa731, 0x0000, CanonicalizeUnique }, + { 0xa732, 0xa76f, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa770, 0xa778, 0x0000, CanonicalizeUnique }, + { 0xa779, 0xa77c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0xa77d, 0xa77d, 0x8a04, CanonicalizeRangeHi }, + { 0xa77e, 0xa787, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa788, 0xa78a, 0x0000, CanonicalizeUnique }, + { 0xa78b, 0xa78c, 0x0000, CanonicalizeAlternatingUnaligned }, + { 0xa78d, 0xa78d, 0xa528, CanonicalizeRangeHi }, + { 0xa78e, 0xa78f, 0x0000, CanonicalizeUnique }, + { 0xa790, 0xa793, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa794, 0xa795, 0x0000, CanonicalizeUnique }, + { 0xa796, 0xa7a9, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa7aa, 0xa7aa, 0xa544, CanonicalizeRangeHi }, + { 0xa7ab, 0xa7ab, 0xa54f, CanonicalizeRangeHi }, + { 0xa7ac, 0xa7ac, 0xa54b, CanonicalizeRangeHi }, + { 0xa7ad, 0xa7ad, 0xa541, CanonicalizeRangeHi }, + { 0xa7ae, 0xa7ae, 0xa544, CanonicalizeRangeHi }, + { 0xa7af, 0xa7af, 0x0000, CanonicalizeUnique }, + { 0xa7b0, 0xa7b0, 0xa512, CanonicalizeRangeHi }, + { 0xa7b1, 0xa7b1, 0xa52a, CanonicalizeRangeHi }, + { 0xa7b2, 0xa7b2, 0xa515, CanonicalizeRangeHi }, + { 0xa7b3, 0xa7b3, 0x03a0, CanonicalizeRangeLo }, + { 0xa7b4, 0xa7b7, 0x0000, CanonicalizeAlternatingAligned }, + { 0xa7b8, 0xab52, 0x0000, CanonicalizeUnique }, + { 0xab53, 0xab53, 0x03a0, CanonicalizeRangeHi }, + { 0xab54, 0xab6f, 0x0000, CanonicalizeUnique }, + { 0xab70, 0xabbf, 0x97d0, CanonicalizeRangeHi }, + { 0xabc0, 0xff20, 0x0000, CanonicalizeUnique }, + { 0xff21, 0xff3a, 0x0020, CanonicalizeRangeLo }, + { 0xff3b, 0xff40, 0x0000, CanonicalizeUnique }, + { 0xff41, 0xff5a, 0x0020, CanonicalizeRangeHi }, + { 0xff5b, 0x103ff, 0x0000, CanonicalizeUnique }, + { 0x10400, 0x10427, 0x0028, CanonicalizeRangeLo }, + { 0x10428, 0x1044f, 0x0028, CanonicalizeRangeHi }, + { 0x10450, 0x104af, 0x0000, CanonicalizeUnique }, + { 0x104b0, 0x104d3, 0x0028, CanonicalizeRangeLo }, 
+ { 0x104d4, 0x104d7, 0x0000, CanonicalizeUnique }, + { 0x104d8, 0x104fb, 0x0028, CanonicalizeRangeHi }, + { 0x104fc, 0x10c7f, 0x0000, CanonicalizeUnique }, + { 0x10c80, 0x10cb2, 0x0040, CanonicalizeRangeLo }, + { 0x10cb3, 0x10cbf, 0x0000, CanonicalizeUnique }, + { 0x10cc0, 0x10cf2, 0x0040, CanonicalizeRangeHi }, + { 0x10cf3, 0x1189f, 0x0000, CanonicalizeUnique }, + { 0x118a0, 0x118bf, 0x0020, CanonicalizeRangeLo }, + { 0x118c0, 0x118df, 0x0020, CanonicalizeRangeHi }, + { 0x118e0, 0x1e8ff, 0x0000, CanonicalizeUnique }, + { 0x1e900, 0x1e921, 0x0022, CanonicalizeRangeLo }, + { 0x1e922, 0x1e943, 0x0022, CanonicalizeRangeHi }, + { 0x1e944, 0x10ffff, 0x0000, CanonicalizeUnique }, +}; + +} } // JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrErrorCode.cpp b/src/3rdparty/masm/yarr/YarrErrorCode.cpp new file mode 100644 index 0000000000..aaebd4613d --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrErrorCode.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2017 Yusuke Suzuki <utatane.tea@gmail.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "YarrErrorCode.h" + +#include "Error.h" + +namespace JSC { namespace Yarr { + +const char* errorMessage(ErrorCode error) +{ +#define REGEXP_ERROR_PREFIX "Invalid regular expression: " + // The order of this array must match the ErrorCode enum. + static const char* errorMessages[] = { + nullptr, // NoError + REGEXP_ERROR_PREFIX "regular expression too large", // PatternTooLarge + REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", // QuantifierOutOfOrder + REGEXP_ERROR_PREFIX "nothing to repeat", // QuantifierWithoutAtom + REGEXP_ERROR_PREFIX "number too large in {} quantifier", // QuantifierTooLarge + REGEXP_ERROR_PREFIX "missing )", // MissingParentheses + REGEXP_ERROR_PREFIX "unmatched parentheses", // ParenthesesUnmatched + REGEXP_ERROR_PREFIX "unrecognized character after (?", // ParenthesesTypeInvalid + REGEXP_ERROR_PREFIX "invalid group specifier name", // InvalidGroupName + REGEXP_ERROR_PREFIX "duplicate group specifier name", // DuplicateGroupName + REGEXP_ERROR_PREFIX "missing terminating ] for character class", // CharacterClassUnmatched + REGEXP_ERROR_PREFIX "range out of order in character class", // CharacterClassOutOfOrder + REGEXP_ERROR_PREFIX "\\ at end of pattern", // EscapeUnterminated + REGEXP_ERROR_PREFIX "invalid unicode {} escape", // InvalidUnicodeEscape + REGEXP_ERROR_PREFIX "invalid backreference for unicode pattern", // InvalidBackreference + REGEXP_ERROR_PREFIX "invalid escaped character for 
unicode pattern", // InvalidIdentityEscape + REGEXP_ERROR_PREFIX "invalid property expression", // InvalidUnicodePropertyExpression + REGEXP_ERROR_PREFIX "too many nested disjunctions", // TooManyDisjunctions + REGEXP_ERROR_PREFIX "pattern exceeds string length limits", // OffsetTooLarge + REGEXP_ERROR_PREFIX "invalid flags" // InvalidRegularExpressionFlags + }; + + return errorMessages[static_cast<unsigned>(error)]; +} + +JSObject* errorToThrow(ExecState* exec, ErrorCode error) +{ + switch (error) { + case ErrorCode::NoError: + ASSERT_NOT_REACHED(); + return nullptr; + case ErrorCode::PatternTooLarge: + case ErrorCode::QuantifierOutOfOrder: + case ErrorCode::QuantifierWithoutAtom: + case ErrorCode::QuantifierTooLarge: + case ErrorCode::MissingParentheses: + case ErrorCode::ParenthesesUnmatched: + case ErrorCode::ParenthesesTypeInvalid: + case ErrorCode::InvalidGroupName: + case ErrorCode::DuplicateGroupName: + case ErrorCode::CharacterClassUnmatched: + case ErrorCode::CharacterClassOutOfOrder: + case ErrorCode::EscapeUnterminated: + case ErrorCode::InvalidUnicodeEscape: + case ErrorCode::InvalidBackreference: + case ErrorCode::InvalidIdentityEscape: + case ErrorCode::InvalidUnicodePropertyExpression: + case ErrorCode::OffsetTooLarge: + case ErrorCode::InvalidRegularExpressionFlags: + return createSyntaxError(exec, errorMessage(error)); + case ErrorCode::TooManyDisjunctions: + return createOutOfMemoryError(exec, errorMessage(error)); + } + + ASSERT_NOT_REACHED(); + return nullptr; +} + +} } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrErrorCode.h b/src/3rdparty/masm/yarr/YarrErrorCode.h new file mode 100644 index 0000000000..48f2bb7900 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrErrorCode.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2017 Yusuke Suzuki <utatane.tea@gmail.com>. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +namespace JSC { + +class ExecState; +class JSObject; + +namespace Yarr { + +enum class ErrorCode : uint8_t { + NoError = 0, + PatternTooLarge, + QuantifierOutOfOrder, + QuantifierWithoutAtom, + QuantifierTooLarge, + MissingParentheses, + ParenthesesUnmatched, + ParenthesesTypeInvalid, + InvalidGroupName, + DuplicateGroupName, + CharacterClassUnmatched, + CharacterClassOutOfOrder, + EscapeUnterminated, + InvalidUnicodeEscape, + InvalidBackreference, + InvalidIdentityEscape, + InvalidUnicodePropertyExpression, + TooManyDisjunctions, + OffsetTooLarge, + InvalidRegularExpressionFlags, +}; + +JS_EXPORT_PRIVATE const char* errorMessage(ErrorCode); +inline bool hasError(ErrorCode errorCode) +{ + return errorCode != ErrorCode::NoError; +} +JS_EXPORT_PRIVATE JSObject* errorToThrow(ExecState*, ErrorCode); + +} } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.cpp b/src/3rdparty/masm/yarr/YarrInterpreter.cpp index 16fc183cad..6eb6750dc4 100644 --- a/src/3rdparty/masm/yarr/YarrInterpreter.cpp +++ b/src/3rdparty/masm/yarr/YarrInterpreter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved. 
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -27,17 +27,15 @@ #include "config.h" #include "YarrInterpreter.h" +#include "Options.h" +#include "SuperSampler.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #include <wtf/BumpPointerAllocator.h> #include <wtf/DataLog.h> #include <wtf/text/CString.h> #include <wtf/text/WTFString.h> -#ifndef NDEBUG -#include <stdio.h> -#endif - using namespace WTF; namespace JSC { namespace Yarr { @@ -47,28 +45,6 @@ class Interpreter { public: struct ParenthesesDisjunctionContext; - struct BackTrackInfoPatternCharacter { - uintptr_t matchAmount; - }; - struct BackTrackInfoCharacterClass { - uintptr_t matchAmount; - }; - struct BackTrackInfoBackReference { - uintptr_t begin; // Not really needed for greedy quantifiers. - uintptr_t matchAmount; // Not really needed for fixed quantifiers. - }; - struct BackTrackInfoAlternative { - uintptr_t offset; - }; - struct BackTrackInfoParentheticalAssertion { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesOnce { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesTerminal { - uintptr_t begin; - }; struct BackTrackInfoParentheses { uintptr_t matchAmount; ParenthesesDisjunctionContext* lastContext; @@ -158,7 +134,7 @@ public: ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) { - size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + 
static_cast<size_t>(disjunction->m_frameSize) * sizeof(uintptr_t); allocatorPool = allocatorPool->ensureCapacity(size); RELEASE_ASSERT(allocatorPool); return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); @@ -171,10 +147,11 @@ public: class InputStream { public: - InputStream(const CharType* input, unsigned start, unsigned length) + InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs) : input(input) , pos(start) , length(length) + , decodeSurrogatePairs(decodeSurrogatePairs) { } @@ -208,13 +185,40 @@ public: RELEASE_ASSERT(pos >= negativePositionOffest); unsigned p = pos - negativePositionOffest; ASSERT(p < length); - return input[p]; + int result = input[p]; + if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) { + if (atEnd()) + return -1; + + result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); + next(); + } + return result; + } + + int readSurrogatePairChecked(unsigned negativePositionOffset) + { + RELEASE_ASSERT(pos >= negativePositionOffset); + unsigned p = pos - negativePositionOffset; + ASSERT(p < length); + if (p + 1 >= length) + return -1; + + int first = input[p]; + int second = input[p + 1]; + if (U16_IS_LEAD(first) && U16_IS_TRAIL(second)) + return U16_GET_SUPPLEMENTARY(first, second); + + return -1; } int reread(unsigned from) { ASSERT(from < length); - return input[from]; + int result = input[from]; + if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1])) + result = U16_GET_SUPPLEMENTARY(result, input[from + 1]); + return result; } int prev() @@ -265,9 +269,9 @@ public: pos -= count; } - bool atStart(unsigned negativePositionOffest) + bool atStart(unsigned negativePositionOffset) { - return pos == negativePositionOffest; + return pos == negativePositionOffset; } bool atEnd(unsigned negativePositionOffest) @@ -285,24 +289,106 @@ public: const CharType* input; unsigned pos; unsigned 
length; + bool decodeSurrogatePairs; }; bool testCharacterClass(CharacterClass* characterClass, int ch) { - if (ch & 0xFF80) { - for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) - if (ch == characterClass->m_matchesUnicode[i]) + auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) { + for (unsigned i = 0; i < matches.size(); ++i) { + if (ch == matches[i]) + return true; + } + + return false; + }; + + auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) { + size_t low = 0; + size_t high = matches.size() - 1; + + while (low <= high) { + size_t mid = low + (high - low) / 2; + int diff = ch - matches[mid]; + if (!diff) + return true; + + if (diff < 0) { + if (mid == low) + return false; + high = mid - 1; + } else + low = mid + 1; + } + return false; + }; + + auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) { + for (unsigned i = 0; i < ranges.size(); ++i) { + if ((ch >= ranges[i].begin) && (ch <= ranges[i].end)) return true; - for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) - if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) + } + + return false; + }; + + auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) { + size_t low = 0; + size_t high = ranges.size() - 1; + + while (low <= high) { + size_t mid = low + (high - low) / 2; + int rangeBeginDiff = ch - ranges[mid].begin; + if (rangeBeginDiff >= 0 && ch <= ranges[mid].end) return true; + + if (rangeBeginDiff < 0) { + if (mid == low) + return false; + high = mid - 1; + } else + low = mid + 1; + } + return false; + }; + + if (characterClass->m_anyCharacter) + return true; + + const size_t thresholdForBinarySearch = 6; + + if (!isASCII(ch)) { + if (characterClass->m_matchesUnicode.size()) { + if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) { + if (binarySearchMatches(characterClass->m_matchesUnicode)) + return true; + } else if 
(linearSearchMatches(characterClass->m_matchesUnicode)) + return true; + } + + if (characterClass->m_rangesUnicode.size()) { + if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) { + if (binarySearchRanges(characterClass->m_rangesUnicode)) + return true; + } else if (linearSearchRanges(characterClass->m_rangesUnicode)) + return true; + } } else { - for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) - if (ch == characterClass->m_matches[i]) + if (characterClass->m_matches.size()) { + if (characterClass->m_matches.size() > thresholdForBinarySearch) { + if (binarySearchMatches(characterClass->m_matches)) + return true; + } else if (linearSearchMatches(characterClass->m_matches)) return true; - for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) - if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) + } + + if (characterClass->m_ranges.size()) { + if (characterClass->m_ranges.size() > thresholdForBinarySearch) { + if (binarySearchRanges(characterClass->m_ranges)) + return true; + } else if (linearSearchRanges(characterClass->m_ranges)) return true; + } } return false; @@ -313,6 +399,11 @@ public: return testChar == input.readChecked(negativeInputOffset); } + bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset) + { + return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset); + } + bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) { int ch = input.readChecked(negativeInputOffset); @@ -332,32 +423,31 @@ public: if (!input.checkInput(matchSize)) return false; - if (pattern->m_ignoreCase) { - for (unsigned i = 0; i < matchSize; ++i) { - int oldCh = input.reread(matchBegin + i); - int ch = input.readChecked(negativeInputOffset + matchSize - i); - - if (oldCh == ch) - continue; - - // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that - // unicode values are never allowed to match against ascii ones. 
- if (isASCII(oldCh) || isASCII(ch)) { + for (unsigned i = 0; i < matchSize; ++i) { + int oldCh = input.reread(matchBegin + i); + int ch; + if (!U_IS_BMP(oldCh)) { + ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i); + ++i; + } else + ch = input.readChecked(negativeInputOffset + matchSize - i); + + if (oldCh == ch) + continue; + + if (pattern->ignoreCase()) { + // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode + // patterns, Unicode values are never allowed to match against ASCII ones. + // For Unicode, we need to check all canonical equivalents of a character. + if (!unicode && (isASCII(oldCh) || isASCII(ch))) { if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) continue; - } else if (areCanonicallyEquivalent(oldCh, ch)) + } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2)) continue; - - input.uncheckInput(matchSize); - return false; - } - } else { - for (unsigned i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { - input.uncheckInput(matchSize); - return false; - } } + + input.uncheckInput(matchSize); + return false; } return true; @@ -365,15 +455,15 @@ public: bool matchAssertionBOL(ByteTerm& term) { - return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); + return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); } bool matchAssertionEOL(ByteTerm& term) { if (term.inputPosition) - return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); + return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, 
input.readChecked(term.inputPosition))); - return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); + return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read())); } bool matchAssertionWordBoundary(ByteTerm& term) @@ -400,18 +490,18 @@ public: case QuantifierGreedy: if (backTrack->matchAmount) { --backTrack->matchAmount; - input.uncheckInput(1); + input.uncheckInput(U16_LENGTH(term.atom.patternCharacter)); return true; } break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) return true; } - input.uncheckInput(backTrack->matchAmount); + input.setPos(backTrack->begin); break; } @@ -435,7 +525,7 @@ public: break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) return true; @@ -450,11 +540,24 @@ public: bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: { - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + if (unicode) { + backTrack->begin = input.getPos(); + unsigned matchAmount = 0; + for (matchAmount = 0; 
matchAmount < term.atom.quantityMaxCount; ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) { + input.setPos(backTrack->begin); + return false; + } + } + + return true; + } + + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) return false; } @@ -462,13 +565,16 @@ public: } case QuantifierGreedy: { + unsigned position = input.getPos(); + backTrack->begin = position; unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { - input.uncheckInput(1); + input.setPos(position); break; } ++matchAmount; + position = input.getPos(); } backTrack->matchAmount = matchAmount; @@ -476,6 +582,7 @@ public: } case QuantifierNonGreedy: + backTrack->begin = input.getPos(); backTrack->matchAmount = 0; return true; } @@ -487,14 +594,28 @@ public: bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); + BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); switch (term.atom.quantityType) { case QuantifierFixedCount: + if (unicode) + input.setPos(backTrack->begin); break; case QuantifierGreedy: if (backTrack->matchAmount) { + if (unicode) { + // Rematch one less match + input.setPos(backTrack->begin); + --backTrack->matchAmount; + for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) { + if (!checkCharacterClass(term.atom.characterClass, 
term.invert(), term.inputPosition + 1)) { + input.uncheckInput(1); + break; + } + } + return true; + } --backTrack->matchAmount; input.uncheckInput(1); return true; @@ -502,12 +623,12 @@ public: break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { ++backTrack->matchAmount; if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) return true; } - input.uncheckInput(backTrack->matchAmount); + input.setPos(backTrack->begin); break; } @@ -539,7 +660,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: { backTrack->begin = input.getPos(); - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { + for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { input.setPos(backTrack->begin); return false; @@ -550,7 +671,7 @@ public: case QuantifierGreedy: { unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) + while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) ++matchAmount; backTrack->matchAmount = matchAmount; return true; @@ -584,7 +705,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: - // for quantityCount == 1, could rewind. + // for quantityMaxCount == 1, could rewind. 
input.setPos(backTrack->begin); break; @@ -597,7 +718,7 @@ public: break; case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { + if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { ++backTrack->matchAmount; return true; } @@ -612,8 +733,8 @@ public: { if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; - output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; + output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition; + output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition; } } void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) @@ -645,7 +766,7 @@ public: bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); @@ -675,11 +796,11 @@ public: bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; + output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition; } if (term.atom.quantityType == QuantifierFixedCount) @@ -692,7 +813,7 @@ public: bool backtrackParenthesesOnceBegin(ByteTerm& term, 
DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); @@ -711,6 +832,7 @@ public: return true; case QuantifierNonGreedy: ASSERT(backTrack->begin != notFound); + FALLTHROUGH; case QuantifierFixedCount: break; } @@ -721,7 +843,7 @@ public: bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); @@ -731,7 +853,7 @@ public: context->term -= term.atom.parenthesesWidth; return false; } - Q_FALLTHROUGH(); + FALLTHROUGH; case QuantifierNonGreedy: if (backTrack->begin == notFound) { backTrack->begin = input.getPos(); @@ -742,11 +864,12 @@ public: ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin); ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); unsigned subpatternId = term.atom.subpatternId; - output[subpatternId << 1] = input.getPos() + term.inputPosition; + output[subpatternId << 1] = input.getPos() - term.inputPosition; } context->term -= term.atom.parenthesesWidth; return true; } + FALLTHROUGH; case QuantifierFixedCount: break; } @@ -758,7 +881,7 @@ public: { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(term.atom.quantityMaxCount == quantifyInfinite); ASSERT(!term.capture()); BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + 
term.frameLocation); @@ -775,7 +898,7 @@ public: if (backTrack->begin == input.getPos()) return false; - // Successful match! Okay, what's next? - loop around and try to match moar! + // Successful match! Okay, what's next? - loop around and try to match more! context->term -= (term.atom.parenthesesWidth + 1); return true; } @@ -784,7 +907,7 @@ public: { ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); ASSERT(term.atom.quantityType == QuantifierGreedy); - ASSERT(term.atom.quantityCount == quantifyInfinite); + ASSERT(term.atom.quantityMaxCount == quantifyInfinite); ASSERT(!term.capture()); // If we backtrack to this point, we have failed to match this iteration of the parens. @@ -804,7 +927,7 @@ public: bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); @@ -815,7 +938,7 @@ public: bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); @@ -833,7 +956,7 @@ public: bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); // We've failed to match parens; if they are inverted, this is win! 
if (term.invert()) { @@ -847,7 +970,7 @@ public: bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) { ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); + ASSERT(term.atom.quantityMaxCount == 1); BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); @@ -867,36 +990,45 @@ public: backTrack->matchAmount = 0; backTrack->lastContext = 0; - switch (term.atom.quantityType) { - case QuantifierFixedCount: { + ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount); + + unsigned minimumMatchCount = term.atom.quantityMinCount; + JSRegExpResult fixedMatchResult; + + // Handle fixed matches and the minimum part of a variable length match. + if (minimumMatchCount) { // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < minimumMatchCount) { // Try to do a match, and it it succeeds, add it to the list. ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); - if (result == JSRegExpMatch) + fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); + if (fixedMatchResult == JSRegExpMatch) appendParenthesesDisjunctionContext(backTrack, context); else { // The match failed; try to find an alternate point to carry on from. 
resetMatches(term, context); freeParenthesesDisjunctionContext(context); - - if (result != JSRegExpNoMatch) - return result; + + if (fixedMatchResult != JSRegExpNoMatch) + return fixedMatchResult; JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); if (backtrackResult != JSRegExpMatch) return backtrackResult; } } - ASSERT(backTrack->matchAmount == term.atom.quantityCount); ParenthesesDisjunctionContext* context = backTrack->lastContext; recordParenthesesMatch(term, context); + } + + switch (term.atom.quantityType) { + case QuantifierFixedCount: { + ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); return JSRegExpMatch; } case QuantifierGreedy: { - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < term.atom.quantityMaxCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); if (result == JSRegExpMatch) @@ -946,7 +1078,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: { - ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); ParenthesesDisjunctionContext* context = 0; JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); @@ -955,7 +1087,7 @@ public: return result; // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < term.atom.quantityMaxCount) { // Try to do a match, and it it succeeds, add it to the list. 
context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)); @@ -975,7 +1107,7 @@ public: } } - ASSERT(backTrack->matchAmount == term.atom.quantityCount); + ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); context = backTrack->lastContext; recordParenthesesMatch(term, context); return JSRegExpMatch; @@ -988,7 +1120,7 @@ public: ParenthesesDisjunctionContext* context = backTrack->lastContext; JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true); if (result == JSRegExpMatch) { - while (backTrack->matchAmount < term.atom.quantityCount) { + while (backTrack->matchAmount < term.atom.quantityMaxCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); if (parenthesesResult == JSRegExpMatch) @@ -1008,7 +1140,7 @@ public: popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); - if (result != JSRegExpNoMatch) + if (result != JSRegExpNoMatch || backTrack->matchAmount < term.atom.quantityMinCount) return result; } @@ -1021,7 +1153,7 @@ public: case QuantifierNonGreedy: { // If we've not reached the limit, try to add one more match. 
- if (backTrack->matchAmount < term.atom.quantityCount) { + if (backTrack->matchAmount < term.atom.quantityMaxCount) { ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)); if (result == JSRegExpMatch) { @@ -1070,16 +1202,23 @@ public: bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) { UNUSED_PARAM(term); + + if (pattern->dotAll()) { + context->matchBegin = startOffset; + context->matchEnd = input.end(); + return true; + } + unsigned matchBegin = context->matchBegin; - if (matchBegin) { + if (matchBegin > startOffset) { for (matchBegin--; true; matchBegin--) { if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { ++matchBegin; break; } - if (!matchBegin) + if (matchBegin == startOffset) break; } } @@ -1091,7 +1230,7 @@ public: if (((matchBegin && term.anchors.m_bol) || ((matchEnd != input.end()) && term.anchors.m_eol)) - && !pattern->m_multiline) + && !pattern->multiline()) return false; context->matchBegin = matchBegin; @@ -1156,21 +1295,37 @@ public: case ByteTerm::TypePatternCharacterOnce: case ByteTerm::TypePatternCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) + if (unicode) { + if (!U_IS_BMP(currentTerm().atom.patternCharacter)) { + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { + if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) { + BACKTRACK(); + } + } + MATCH_NEXT(); + } + } + unsigned position = input.getPos(); // May need to back out reading a surrogate pair. 
+ + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) { + input.setPos(position); BACKTRACK(); + } } MATCH_NEXT(); } case ByteTerm::TypePatternCharacterGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + unsigned position = input.getPos(); // May need to back out reading a surrogate pair. + while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { - input.uncheckInput(1); + input.setPos(position); break; } ++matchAmount; + position = input.getPos(); } backTrack->matchAmount = matchAmount; @@ -1178,13 +1333,29 @@ public: } case ByteTerm::TypePatternCharacterNonGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + backTrack->begin = input.getPos(); backTrack->matchAmount = 0; MATCH_NEXT(); } case ByteTerm::TypePatternCasedCharacterOnce: case ByteTerm::TypePatternCasedCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { + if (unicode) { + // Case insensitive matching of unicode characters is handled as TypeCharacterClass. + ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter)); + + unsigned position = input.getPos(); // May need to back out reading a surrogate pair. 
+ + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) { + input.setPos(position); + BACKTRACK(); + } + } + MATCH_NEXT(); + } + + for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) BACKTRACK(); } @@ -1192,8 +1363,12 @@ public: } case ByteTerm::TypePatternCasedCharacterGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + + // Case insensitive matching of unicode characters is handled as TypeCharacterClass. + ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); + unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { + while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { input.uncheckInput(1); break; @@ -1206,6 +1381,10 @@ public: } case ByteTerm::TypePatternCasedCharacterNonGreedy: { BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); + + // Case insensitive matching of unicode characters is handled as TypeCharacterClass. 
+ ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter)); + backTrack->matchAmount = 0; MATCH_NEXT(); } @@ -1287,7 +1466,7 @@ public: if (offset > 0) MATCH_NEXT(); - if (input.atEnd()) + if (input.atEnd() || pattern->sticky()) return JSRegExpNoMatch; input.next(); @@ -1417,6 +1596,9 @@ public: if (!input.isAvailableInput(0)) return offsetNoMatch; + if (pattern->m_lock) + pattern->m_lock->lock(); + for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) output[i << 1] = offsetNoMatch; @@ -1436,23 +1618,31 @@ public: pattern->m_allocator->stopAllocator(); ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch)); + + if (pattern->m_lock) + pattern->m_lock->unlock(); + return output[0]; } Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) : pattern(pattern) + , unicode(pattern->unicode()) , output(output) - , input(input, start, length) + , input(input, start, length, pattern->unicode()) , allocatorPool(0) + , startOffset(start) , remainingMatchCount(matchLimit) { } private: BytecodePattern* pattern; + bool unicode; unsigned* output; InputStream input; BumpPointerPool* allocatorPool; + unsigned startOffset; unsigned remainingMatchCount; }; @@ -1474,13 +1664,18 @@ public: m_currentAlternativeIndex = 0; } - PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator) + std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock) { regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); emitDisjunction(m_pattern.m_body); regexEnd(); - return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); +#ifndef NDEBUG + if (Options::dumpCompiledRegExpPatterns()) + dumpDisjunction(m_bodyDisjunction.get()); +#endif + + return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, 
allocator, lock); } void checkInput(unsigned count) @@ -1508,45 +1703,44 @@ public: m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); } - void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { - if (m_pattern.m_ignoreCase) { - UChar lo = Unicode::toLower(ch); - UChar hi = Unicode::toUpper(ch); + if (m_pattern.ignoreCase()) { + UChar32 lo = u_tolower(ch); + UChar32 hi = u_toupper(ch); if (lo != hi) { - m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); + m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType)); return; } } - m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); + m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType)); } - void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = 
frameLocation; } - void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { ASSERT(subpatternId); m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int beginTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + unsigned beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; @@ -1559,8 +1753,7 @@ public: void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int beginTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation 
= frameLocation; @@ -1577,8 +1770,7 @@ public: // then fix this up at the end! - simplifying this should make it much clearer. // https://bugs.webkit.org/show_bug.cgi?id=50136 - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int beginTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; @@ -1591,8 +1783,7 @@ public: void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) { - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int beginTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + int beginTerm = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0)); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; @@ -1603,12 +1794,11 @@ public: m_currentAlternativeIndex = beginTerm + 1; } - void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - unsigned endTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + unsigned endTerm = m_bodyDisjunction->terms.size(); ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin); @@ -1620,9 +1810,9 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; 
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } @@ -1634,8 +1824,7 @@ public: unsigned popParenthesesStack() { ASSERT(m_parenthesesStack.size()); - ASSERT(m_parenthesesStack.size() <= INT_MAX); - int stackEnd = static_cast<int>(m_parenthesesStack.size()) - 1; + int stackEnd = m_parenthesesStack.size() - 1; unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; m_parenthesesStack.shrink(stackEnd); @@ -1646,22 +1835,11 @@ public: return beginTerm; } -#ifndef NDEBUG - void dumpDisjunction(ByteDisjunction* disjunction) - { - dataLogF("ByteDisjunction(%p):\n\t", disjunction); - for (unsigned i = 0; i < disjunction->terms.size(); ++i) - dataLogF("{ %d } ", disjunction->terms[i].type); - dataLogF("\n"); - } -#endif - void closeAlternative(int beginTerm) { int origBeginTerm = beginTerm; ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int endIndex = static_cast<int>(m_bodyDisjunction->terms.size()); + int endIndex = m_bodyDisjunction->terms.size(); unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; @@ -1687,8 +1865,7 @@ public: int beginTerm = 0; int origBeginTerm = 0; ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int endIndex = static_cast<int>(m_bodyDisjunction->terms.size()); 
+ int endIndex = m_bodyDisjunction->terms.size(); unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; @@ -1705,12 +1882,11 @@ public: m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } - void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - unsigned endTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + unsigned endTerm = m_bodyDisjunction->terms.size(); ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); @@ -1720,7 +1896,7 @@ public: unsigned subpatternId = parenthesesBegin.atom.subpatternId; unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - OwnPtr<ByteDisjunction> parenthesesDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize); unsigned firstTermInParentheses = beginTerm + 1; parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); @@ -1733,19 +1909,19 @@ public: m_bodyDisjunction->terms.shrink(beginTerm); m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); - m_allParenthesesInfo.append(parenthesesDisjunction.release()); + m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction)); - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + 
m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; } - void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - unsigned endTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + unsigned endTerm = m_bodyDisjunction->terms.size(); ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin); @@ -1757,18 +1933,19 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } - void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + void 
atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - unsigned endTerm = static_cast<int>(m_bodyDisjunction->terms.size()); + unsigned endTerm = m_bodyDisjunction->terms.size(); ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin); @@ -1780,15 +1957,17 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet(); + m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) { - m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize); m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); m_bodyDisjunction->terms[0].frameLocation = 0; m_currentAlternativeIndex = 0; @@ -1801,8 +1980,7 @@ public: void alternativeBodyDisjunction(bool onceThrough) { - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int newAlternativeIndex = 
static_cast<int>(m_bodyDisjunction->terms.size()); + int newAlternativeIndex = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); @@ -1811,8 +1989,7 @@ public: void alternativeDisjunction() { - ASSERT(m_bodyDisjunction->terms.size() <= INT_MAX); - int newAlternativeIndex = static_cast<int>(m_bodyDisjunction->terms.size()); + int newAlternativeIndex = m_bodyDisjunction->terms.size(); m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); @@ -1842,9 +2019,7 @@ public: currentCountAlreadyChecked += countToCheck; } - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - + for (auto& term : alternative->m_terms) { switch (term.type) { case PatternTerm::TypeAssertionBOL: assertionBOL(currentCountAlreadyChecked - term.inputPosition); @@ -1859,15 +2034,15 @@ public: break; case PatternTerm::TypePatternCharacter: - atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); break; case PatternTerm::TypeCharacterClass: - atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); break; case PatternTerm::TypeBackReference: - atomBackReference(term.backReferenceSubpatternId, 
currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); + atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityMaxCount, term.quantityType); break; case PatternTerm::TypeForwardReference: @@ -1875,27 +2050,30 @@ public: case PatternTerm::TypeParenthesesSubpattern: { unsigned disjunctionAlreadyCheckedCount = 0; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { + if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { unsigned alternativeFrameLocation = term.frameLocation; // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. if (term.quantityType == QuantifierFixedCount) disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; else alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); } else if (term.parentheses.isTerminal) { - unsigned delegateEndInputOffset = term.inputPosition - 
currentCountAlreadyChecked; - atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); - atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); + atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition; + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); + atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, 
term.parentheses.disjunction->m_callFrameSize); } break; } @@ -1903,8 +2081,8 @@ public: case PatternTerm::TypeParentheticalAssertion: { unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; - ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition)); - unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition); + ASSERT(currentCountAlreadyChecked >= term.inputPosition); + unsigned positiveInputOffset = currentCountAlreadyChecked - term.inputPosition; unsigned uncheckAmount = 0; if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; @@ -1914,7 +2092,7 @@ public: atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount); - atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType); + atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType); if (uncheckAmount) { checkInput(uncheckAmount); currentCountAlreadyChecked += uncheckAmount; @@ -1929,22 +2107,283 @@ public: } } } +#ifndef NDEBUG + void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0) + { + PrintStream& out = WTF::dataFile(); + + unsigned termIndexNest = 0; + + if (!nesting) { + out.printf("ByteDisjunction(%p):\n", disjunction); + nesting = 1; + } else { + termIndexNest = nesting - 1; + nesting = 2; + } + + auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) { + for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++) + out.print(" "); + out.printf("%4zu", index); + for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++) + out.print(" "); + }; + + auto dumpQuantity = 
[&](ByteTerm& term) { + if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1) + return; + + out.print(" {", term.atom.quantityMinCount); + if (term.atom.quantityMinCount != term.atom.quantityMaxCount) { + if (term.atom.quantityMaxCount == UINT_MAX) + out.print(",inf"); + else + out.print(",", term.atom.quantityMaxCount); + } + out.print("}"); + if (term.atom.quantityType == QuantifierGreedy) + out.print(" greedy"); + else if (term.atom.quantityType == QuantifierNonGreedy) + out.print(" non-greedy"); + }; + + auto dumpCaptured = [&](ByteTerm& term) { + if (term.capture()) + out.print(" captured (#", term.atom.subpatternId, ")"); + }; + + auto dumpInverted = [&](ByteTerm& term) { + if (term.invert()) + out.print(" inverted"); + }; + + auto dumpInputPosition = [&](ByteTerm& term) { + out.printf(" inputPosition %u", term.inputPosition); + }; + + auto dumpFrameLocation = [&](ByteTerm& term) { + out.printf(" frameLocation %u", term.frameLocation); + }; + + auto dumpCharacter = [&](ByteTerm& term) { + out.print(" "); + dumpUChar32(out, term.atom.patternCharacter); + }; + + auto dumpCharClass = [&](ByteTerm& term) { + out.print(" "); + dumpCharacterClass(out, &m_pattern, term.atom.characterClass); + }; + + for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) { + ByteTerm term = disjunction->terms[idx]; + + bool outputNewline = true; + + switch (term.type) { + case ByteTerm::TypeBodyAlternativeBegin: + outputTermIndexAndNest(idx, nesting++); + out.print("BodyAlternativeBegin"); + if (term.alternative.onceThrough) + out.print(" onceThrough"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeBodyAlternativeDisjunction: + outputTermIndexAndNest(idx, nesting - 1); + out.print("BodyAlternativeDisjunction"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeBodyAlternativeEnd: + outputTermIndexAndNest(idx, --nesting); + out.print("BodyAlternativeEnd"); + dumpFrameLocation(term); + break; + 
case ByteTerm::TypeAlternativeBegin: + outputTermIndexAndNest(idx, nesting++); + out.print("AlternativeBegin"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeAlternativeDisjunction: + outputTermIndexAndNest(idx, nesting - 1); + out.print("AlternativeDisjunction"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeAlternativeEnd: + outputTermIndexAndNest(idx, --nesting); + out.print("AlternativeEnd"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeSubpatternBegin: + outputTermIndexAndNest(idx, nesting++); + out.print("SubpatternBegin"); + break; + case ByteTerm::TypeSubpatternEnd: + outputTermIndexAndNest(idx, --nesting); + out.print("SubpatternEnd"); + break; + case ByteTerm::TypeAssertionBOL: + outputTermIndexAndNest(idx, nesting); + out.print("AssertionBOL"); + break; + case ByteTerm::TypeAssertionEOL: + outputTermIndexAndNest(idx, nesting); + out.print("AssertionEOL"); + break; + case ByteTerm::TypeAssertionWordBoundary: + outputTermIndexAndNest(idx, nesting); + out.print("AssertionWordBoundary"); + break; + case ByteTerm::TypePatternCharacterOnce: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCharacterOnce"); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + dumpCharacter(term); + dumpQuantity(term); + break; + case ByteTerm::TypePatternCharacterFixed: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCharacterFixed"); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + dumpCharacter(term); + out.print(" {", term.atom.quantityMinCount, "}"); + break; + case ByteTerm::TypePatternCharacterGreedy: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCharacterGreedy"); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + dumpCharacter(term); + dumpQuantity(term); + break; + case ByteTerm::TypePatternCharacterNonGreedy: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCharacterNonGreedy"); + dumpInverted(term); + 
dumpInputPosition(term); + dumpFrameLocation(term); + dumpCharacter(term); + dumpQuantity(term); + break; + case ByteTerm::TypePatternCasedCharacterOnce: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCasedCharacterOnce"); + break; + case ByteTerm::TypePatternCasedCharacterFixed: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCasedCharacterFixed"); + break; + case ByteTerm::TypePatternCasedCharacterGreedy: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCasedCharacterGreedy"); + break; + case ByteTerm::TypePatternCasedCharacterNonGreedy: + outputTermIndexAndNest(idx, nesting); + out.print("PatternCasedCharacterNonGreedy"); + break; + case ByteTerm::TypeCharacterClass: + outputTermIndexAndNest(idx, nesting); + out.print("CharacterClass"); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + dumpCharClass(term); + dumpQuantity(term); + break; + case ByteTerm::TypeBackReference: + outputTermIndexAndNest(idx, nesting); + out.print("BackReference #", term.atom.subpatternId); + dumpQuantity(term); + break; + case ByteTerm::TypeParenthesesSubpattern: + outputTermIndexAndNest(idx, nesting); + out.print("ParenthesesSubpattern"); + dumpCaptured(term); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + dumpQuantity(term); + out.print("\n"); + outputNewline = false; + dumpDisjunction(term.atom.parenthesesDisjunction, nesting); + break; + case ByteTerm::TypeParenthesesSubpatternOnceBegin: + outputTermIndexAndNest(idx, nesting++); + out.print("ParenthesesSubpatternOnceBegin"); + dumpCaptured(term); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + break; + case ByteTerm::TypeParenthesesSubpatternOnceEnd: + outputTermIndexAndNest(idx, --nesting); + out.print("ParenthesesSubpatternOnceEnd"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeParenthesesSubpatternTerminalBegin: + outputTermIndexAndNest(idx, nesting++); + 
out.print("ParenthesesSubpatternTerminalBegin"); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + break; + case ByteTerm::TypeParenthesesSubpatternTerminalEnd: + outputTermIndexAndNest(idx, --nesting); + out.print("ParenthesesSubpatternTerminalEnd"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeParentheticalAssertionBegin: + outputTermIndexAndNest(idx, nesting++); + out.print("ParentheticalAssertionBegin"); + dumpInverted(term); + dumpInputPosition(term); + dumpFrameLocation(term); + break; + case ByteTerm::TypeParentheticalAssertionEnd: + outputTermIndexAndNest(idx, --nesting); + out.print("ParentheticalAssertionEnd"); + dumpFrameLocation(term); + break; + case ByteTerm::TypeCheckInput: + outputTermIndexAndNest(idx, nesting); + out.print("CheckInput ", term.checkInputCount); + break; + case ByteTerm::TypeUncheckInput: + outputTermIndexAndNest(idx, nesting); + out.print("UncheckInput ", term.checkInputCount); + break; + case ByteTerm::TypeDotStarEnclosure: + outputTermIndexAndNest(idx, nesting); + out.print("DotStarEnclosure"); + break; + } + if (outputNewline) + out.print("\n"); + } + } +#endif private: YarrPattern& m_pattern; - OwnPtr<ByteDisjunction> m_bodyDisjunction; + std::unique_ptr<ByteDisjunction> m_bodyDisjunction; unsigned m_currentAlternativeIndex; Vector<ParenthesesStackEntry> m_parenthesesStack; - Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo; + Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; }; -PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) +std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) { - return ByteCompiler(pattern).compile(allocator); + return ByteCompiler(pattern).compile(allocator, lock); } unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output) { + SuperSamplerScope superSamplerScope(false); if (input.is8Bit()) 
return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); @@ -1952,22 +2391,24 @@ unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned star unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) { + SuperSamplerScope superSamplerScope(false); return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); } unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) { + SuperSamplerScope superSamplerScope(false); return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); } // These should be the same for both UChar & LChar. -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), 
CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); -COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); +COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); } } diff --git a/src/3rdparty/masm/yarr/YarrInterpreter.h b/src/3rdparty/masm/yarr/YarrInterpreter.h index 3b44acbd2b..a319cb3461 100644 --- a/src/3rdparty/masm/yarr/YarrInterpreter.h +++ b/src/3rdparty/masm/yarr/YarrInterpreter.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,12 +23,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrInterpreter_h -#define YarrInterpreter_h +#pragma once +#include "ConcurrentJSLock.h" #include "YarrPattern.h" -#include <wtf/PassOwnPtr.h> -#include <wtf/unicode/Unicode.h> namespace WTF { class BumpPointerAllocator; @@ -76,10 +74,10 @@ struct ByteTerm { union { struct { union { - UChar patternCharacter; + UChar32 patternCharacter; struct { - UChar lo; - UChar hi; + UChar32 lo; + UChar32 hi; } casedCharacter; CharacterClass* characterClass; unsigned subpatternId; @@ -89,7 +87,8 @@ struct ByteTerm { unsigned parenthesesWidth; }; QuantifierType quantityType; - unsigned quantityCount; + unsigned quantityMinCount; + unsigned quantityMaxCount; } atom; struct { int next; @@ -107,11 +106,17 @@ struct ByteTerm { bool m_invert : 1; unsigned inputPosition; - ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) : frameLocation(frameLocation) , m_capture(false) , m_invert(false) { + atom.patternCharacter = ch; + atom.quantityType = quantityType; + atom.quantityMinCount = quantityCount.unsafeGet(); + atom.quantityMaxCount = quantityCount.unsafeGet(); + inputPosition = inputPos; + switch (quantityType) { case QuantifierFixedCount: type = (quantityCount == 1) ? 
ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; @@ -123,14 +128,9 @@ struct ByteTerm { type = ByteTerm::TypePatternCharacterNonGreedy; break; } - - atom.patternCharacter = ch; - atom.quantityType = quantityType; - atom.quantityCount = quantityCount.unsafeGet(); - inputPosition = inputPos; } - ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) + ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) : frameLocation(frameLocation) , m_capture(false) , m_invert(false) @@ -150,22 +150,24 @@ struct ByteTerm { atom.casedCharacter.lo = lo; atom.casedCharacter.hi = hi; atom.quantityType = quantityType; - atom.quantityCount = quantityCount.unsafeGet(); + atom.quantityMinCount = quantityCount.unsafeGet(); + atom.quantityMaxCount = quantityCount.unsafeGet(); inputPosition = inputPos; } - ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) + ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos) : type(ByteTerm::TypeCharacterClass) , m_capture(false) , m_invert(invert) { atom.characterClass = characterClass; atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; inputPosition = inputPos; } - ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) + ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos) : type(type) , m_capture(capture) , m_invert(false) @@ -173,7 +175,8 @@ struct ByteTerm { atom.subpatternId = subpatternId; atom.parenthesesDisjunction = parenthesesInfo; atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; inputPosition = inputPos; } @@ -183,21 +186,23 @@ struct ByteTerm { , m_invert(invert) { 
atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; } - ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) + ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos) : type(type) , m_capture(capture) , m_invert(invert) { atom.subpatternId = subpatternId; atom.quantityType = QuantifierFixedCount; - atom.quantityCount = 1; + atom.quantityMinCount = 1; + atom.quantityMaxCount = 1; inputPosition = inputPos; } - static ByteTerm BOL(int inputPos) + static ByteTerm BOL(unsigned inputPos) { ByteTerm term(TypeAssertionBOL); term.inputPosition = inputPos; @@ -218,21 +223,21 @@ struct ByteTerm { return term; } - static ByteTerm EOL(int inputPos) + static ByteTerm EOL(unsigned inputPos) { ByteTerm term(TypeAssertionEOL); term.inputPosition = inputPos; return term; } - static ByteTerm WordBoundary(bool invert, int inputPos) + static ByteTerm WordBoundary(bool invert, unsigned inputPos) { ByteTerm term(TypeAssertionWordBoundary, invert); term.inputPosition = inputPos; return term; } - static ByteTerm BackReference(unsigned subpatternId, int inputPos) + static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos) { return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); } @@ -329,6 +334,8 @@ public: { } + size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); } + Vector<ByteTerm> terms; unsigned m_numSubpatterns; unsigned m_frameSize; @@ -337,16 +344,19 @@ public: struct BytecodePattern { WTF_MAKE_FAST_ALLOCATED; public: - BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<OwnPtr<ByteDisjunction> >& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator) - : m_body(body) - , m_ignoreCase(pattern.m_ignoreCase) - , m_multiline(pattern.m_multiline) + BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& 
parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) + : m_body(WTFMove(body)) + , m_flags(pattern.m_flags) , m_allocator(allocator) + , m_lock(lock) { m_body->terms.shrinkToFit(); newlineCharacterClass = pattern.newlineCharacterClass(); - wordcharCharacterClass = pattern.wordcharCharacterClass(); + if (unicode() && ignoreCase()) + wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass(); + else + wordcharCharacterClass = pattern.wordcharCharacterClass(); m_allParenthesesInfo.swap(parenthesesInfoToAdopt); m_allParenthesesInfo.shrinkToFit(); @@ -355,26 +365,32 @@ public: m_userCharacterClasses.shrinkToFit(); } - OwnPtr<ByteDisjunction> m_body; - bool m_ignoreCase; - bool m_multiline; + size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); } + + bool ignoreCase() const { return m_flags & FlagIgnoreCase; } + bool multiline() const { return m_flags & FlagMultiline; } + bool sticky() const { return m_flags & FlagSticky; } + bool unicode() const { return m_flags & FlagUnicode; } + bool dotAll() const { return m_flags & FlagDotAll; } + + std::unique_ptr<ByteDisjunction> m_body; + RegExpFlags m_flags; // Each BytecodePattern is associated with a RegExp, each RegExp is associated - // with a JSGlobalData. Cache a pointer to out JSGlobalData's m_regExpAllocator. + // with a VM. Cache a pointer to out VM's m_regExpAllocator. 
BumpPointerAllocator* m_allocator; + ConcurrentJSLock* m_lock; CharacterClass* newlineCharacterClass; CharacterClass* wordcharCharacterClass; private: - Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo; - Vector<OwnPtr<CharacterClass> > m_userCharacterClasses; + Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; + Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; }; -JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); +JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr); JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); } } // namespace JSC::Yarr - -#endif // YarrInterpreter_h diff --git a/src/3rdparty/masm/yarr/YarrJIT.cpp b/src/3rdparty/masm/yarr/YarrJIT.cpp index 71123b7be7..056de2dbde 100644 --- a/src/3rdparty/masm/yarr/YarrJIT.cpp +++ b/src/3rdparty/masm/yarr/YarrJIT.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009-2018 Apple Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,22 +25,23 @@ #include "config.h" #include "YarrJIT.h" + #include <wtf/ASCIICType.h> +#include "LinkBuffer.h" #include "Options.h" +#include "VM.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #if ENABLE(YARR_JIT) -#include "LinkBuffer.h" - using namespace WTF; namespace JSC { namespace Yarr { template<YarrJITCompileMode compileMode> class YarrGenerator : private DefaultMacroAssembler { - friend void jitCompile(JSGlobalData*, YarrCodeBlock& jitObject, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); + friend void jitCompile(VM*, YarrCodeBlock&, const String& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline); #if CPU(ARM) static const RegisterID input = ARMRegisters::r0; @@ -50,20 +51,38 @@ class YarrGenerator : private DefaultMacroAssembler { static const RegisterID regT0 = ARMRegisters::r4; static const RegisterID regT1 = ARMRegisters::r5; + static const RegisterID initialStart = ARMRegisters::r8; static const RegisterID returnRegister = ARMRegisters::r0; static const RegisterID returnRegister2 = ARMRegisters::r1; + +#define HAVE_INITIAL_START_REG #elif CPU(ARM64) + // Argument registers static const RegisterID input = ARM64Registers::x0; static const RegisterID index = ARM64Registers::x1; static const RegisterID length = ARM64Registers::x2; static const RegisterID output = ARM64Registers::x3; - - static const RegisterID regT0 = ARM64Registers::x4; - static const RegisterID regT1 = ARM64Registers::x5; + static const RegisterID freelistRegister = ARM64Registers::x4; + static const RegisterID freelistSizeRegister = ARM64Registers::x5; + + // Scratch registers + static const RegisterID regT0 = ARM64Registers::x6; + static const RegisterID regT1 = ARM64Registers::x7; + static const RegisterID 
regT2 = ARM64Registers::x8; + static const RegisterID remainingMatchCount = ARM64Registers::x9; + static const RegisterID regUnicodeInputAndTrail = ARM64Registers::x10; + static const RegisterID initialStart = ARM64Registers::x11; + static const RegisterID supplementaryPlanesBase = ARM64Registers::x12; + static const RegisterID surrogateTagMask = ARM64Registers::x13; + static const RegisterID leadingSurrogateTag = ARM64Registers::x14; + static const RegisterID trailingSurrogateTag = ARM64Registers::x15; static const RegisterID returnRegister = ARM64Registers::x0; static const RegisterID returnRegister2 = ARM64Registers::x1; + +#define HAVE_INITIAL_START_REG +#define JIT_UNICODE_EXPRESSIONS #elif CPU(MIPS) static const RegisterID input = MIPSRegisters::a0; static const RegisterID index = MIPSRegisters::a1; @@ -72,20 +91,12 @@ class YarrGenerator : private DefaultMacroAssembler { static const RegisterID regT0 = MIPSRegisters::t4; static const RegisterID regT1 = MIPSRegisters::t5; + static const RegisterID initialStart = MIPSRegisters::t6; static const RegisterID returnRegister = MIPSRegisters::v0; static const RegisterID returnRegister2 = MIPSRegisters::v1; -#elif CPU(SH4) - static const RegisterID input = SH4Registers::r4; - static const RegisterID index = SH4Registers::r5; - static const RegisterID length = SH4Registers::r6; - static const RegisterID output = SH4Registers::r7; - static const RegisterID regT0 = SH4Registers::r0; - static const RegisterID regT1 = SH4Registers::r1; - - static const RegisterID returnRegister = SH4Registers::r0; - static const RegisterID returnRegister2 = SH4Registers::r1; +#define HAVE_INITIAL_START_REG #elif CPU(X86) static const RegisterID input = X86Registers::eax; static const RegisterID index = X86Registers::edx; @@ -99,10 +110,13 @@ class YarrGenerator : private DefaultMacroAssembler { static const RegisterID returnRegister2 = X86Registers::edx; #elif CPU(X86_64) #if !OS(WINDOWS) + // Argument registers static const RegisterID 
input = X86Registers::edi; static const RegisterID index = X86Registers::esi; static const RegisterID length = X86Registers::edx; static const RegisterID output = X86Registers::ecx; + static const RegisterID freelistRegister = X86Registers::r8; + static const RegisterID freelistSizeRegister = X86Registers::r9; // Only used during initialization. #else // If the return value doesn't fit in 64bits, its destination is pointed by rcx and the parameters are shifted. // http://msdn.microsoft.com/en-us/library/7572ztz4.aspx @@ -113,11 +127,186 @@ class YarrGenerator : private DefaultMacroAssembler { static const RegisterID output = X86Registers::r10; #endif + // Scratch registers static const RegisterID regT0 = X86Registers::eax; - static const RegisterID regT1 = X86Registers::ebx; +#if !OS(WINDOWS) + static const RegisterID regT1 = X86Registers::r9; + static const RegisterID regT2 = X86Registers::r10; +#else + static const RegisterID regT1 = X86Registers::ecx; + static const RegisterID regT2 = X86Registers::edi; +#endif + + static const RegisterID initialStart = X86Registers::ebx; +#if !OS(WINDOWS) + static const RegisterID remainingMatchCount = X86Registers::r12; +#else + static const RegisterID remainingMatchCount = X86Registers::esi; +#endif + static const RegisterID regUnicodeInputAndTrail = X86Registers::r13; + static const RegisterID leadingSurrogateTag = X86Registers::r14; + static const RegisterID trailingSurrogateTag = X86Registers::r15; static const RegisterID returnRegister = X86Registers::eax; static const RegisterID returnRegister2 = X86Registers::edx; + + const TrustedImm32 supplementaryPlanesBase = TrustedImm32(0x10000); + const TrustedImm32 surrogateTagMask = TrustedImm32(0xfffffc00); +#define HAVE_INITIAL_START_REG +#define JIT_UNICODE_EXPRESSIONS +#endif + +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + struct ParenContextSizes { + size_t m_numSubpatterns; + size_t m_frameSlots; + + ParenContextSizes(size_t numSubpatterns, size_t frameSlots) + : 
m_numSubpatterns(numSubpatterns) + , m_frameSlots(frameSlots) + { + } + + size_t numSubpatterns() { return m_numSubpatterns; } + + size_t frameSlots() { return m_frameSlots; } + }; + + struct ParenContext { + struct ParenContext* next; + uint32_t begin; + uint32_t matchAmount; + uintptr_t returnAddress; + struct Subpatterns { + unsigned start; + unsigned end; + } subpatterns[0]; + uintptr_t frameSlots[0]; + + static size_t sizeFor(ParenContextSizes& parenContextSizes) + { + return sizeof(ParenContext) + sizeof(Subpatterns) * parenContextSizes.numSubpatterns() + sizeof(uintptr_t) * parenContextSizes.frameSlots(); + } + + static ptrdiff_t nextOffset() + { + return offsetof(ParenContext, next); + } + + static ptrdiff_t beginOffset() + { + return offsetof(ParenContext, begin); + } + + static ptrdiff_t matchAmountOffset() + { + return offsetof(ParenContext, matchAmount); + } + + static ptrdiff_t returnAddressOffset() + { + return offsetof(ParenContext, returnAddress); + } + + static ptrdiff_t subpatternOffset(size_t subpattern) + { + return offsetof(ParenContext, subpatterns) + (subpattern - 1) * sizeof(Subpatterns); + } + + static ptrdiff_t savedFrameOffset(ParenContextSizes& parenContextSizes) + { + return offsetof(ParenContext, subpatterns) + (parenContextSizes.numSubpatterns()) * sizeof(Subpatterns); + } + }; + + void initParenContextFreeList() + { + RegisterID parenContextPointer = regT0; + RegisterID nextParenContextPointer = regT2; + + size_t parenContextSize = ParenContext::sizeFor(m_parenContextSizes); + + parenContextSize = WTF::roundUpToMultipleOf<sizeof(uintptr_t)>(parenContextSize); + + // Check that the paren context is a reasonable size. 
+ if (parenContextSize > INT16_MAX) + m_abortExecution.append(jump()); + + Jump emptyFreeList = branchTestPtr(Zero, freelistRegister); + move(freelistRegister, parenContextPointer); + addPtr(TrustedImm32(parenContextSize), freelistRegister, nextParenContextPointer); + addPtr(freelistRegister, freelistSizeRegister); + subPtr(TrustedImm32(parenContextSize), freelistSizeRegister); + + Label loopTop(this); + Jump initDone = branchPtr(Above, nextParenContextPointer, freelistSizeRegister); + storePtr(nextParenContextPointer, Address(parenContextPointer, ParenContext::nextOffset())); + move(nextParenContextPointer, parenContextPointer); + addPtr(TrustedImm32(parenContextSize), parenContextPointer, nextParenContextPointer); + jump(loopTop); + + initDone.link(this); + storePtr(TrustedImmPtr(nullptr), Address(parenContextPointer, ParenContext::nextOffset())); + emptyFreeList.link(this); + } + + void allocateParenContext(RegisterID result) + { + m_abortExecution.append(branchTestPtr(Zero, freelistRegister)); + sub32(TrustedImm32(1), remainingMatchCount); + m_hitMatchLimit.append(branchTestPtr(Zero, remainingMatchCount)); + move(freelistRegister, result); + loadPtr(Address(freelistRegister, ParenContext::nextOffset()), freelistRegister); + } + + void freeParenContext(RegisterID headPtrRegister, RegisterID newHeadPtrRegister) + { + loadPtr(Address(headPtrRegister, ParenContext::nextOffset()), newHeadPtrRegister); + storePtr(freelistRegister, Address(headPtrRegister, ParenContext::nextOffset())); + move(headPtrRegister, freelistRegister); + } + + void saveParenContext(RegisterID parenContextReg, RegisterID tempReg, unsigned firstSubpattern, unsigned lastSubpattern, unsigned subpatternBaseFrameLocation) + { + store32(index, Address(parenContextReg, ParenContext::beginOffset())); + loadFromFrame(subpatternBaseFrameLocation + BackTrackInfoParentheses::matchAmountIndex(), tempReg); + store32(tempReg, Address(parenContextReg, ParenContext::matchAmountOffset())); + 
loadFromFrame(subpatternBaseFrameLocation + BackTrackInfoParentheses::returnAddressIndex(), tempReg); + storePtr(tempReg, Address(parenContextReg, ParenContext::returnAddressOffset())); + if (compileMode == IncludeSubpatterns) { + for (unsigned subpattern = firstSubpattern; subpattern <= lastSubpattern; subpattern++) { + loadPtr(Address(output, (subpattern << 1) * sizeof(unsigned)), tempReg); + storePtr(tempReg, Address(parenContextReg, ParenContext::subpatternOffset(subpattern))); + clearSubpatternStart(subpattern); + } + } + subpatternBaseFrameLocation += YarrStackSpaceForBackTrackInfoParentheses; + for (unsigned frameLocation = subpatternBaseFrameLocation; frameLocation < m_parenContextSizes.frameSlots(); frameLocation++) { + loadFromFrame(frameLocation, tempReg); + storePtr(tempReg, Address(parenContextReg, ParenContext::savedFrameOffset(m_parenContextSizes) + frameLocation * sizeof(uintptr_t))); + } + } + + void restoreParenContext(RegisterID parenContextReg, RegisterID tempReg, unsigned firstSubpattern, unsigned lastSubpattern, unsigned subpatternBaseFrameLocation) + { + load32(Address(parenContextReg, ParenContext::beginOffset()), index); + storeToFrame(index, subpatternBaseFrameLocation + BackTrackInfoParentheses::beginIndex()); + load32(Address(parenContextReg, ParenContext::matchAmountOffset()), tempReg); + storeToFrame(tempReg, subpatternBaseFrameLocation + BackTrackInfoParentheses::matchAmountIndex()); + loadPtr(Address(parenContextReg, ParenContext::returnAddressOffset()), tempReg); + storeToFrame(tempReg, subpatternBaseFrameLocation + BackTrackInfoParentheses::returnAddressIndex()); + if (compileMode == IncludeSubpatterns) { + for (unsigned subpattern = firstSubpattern; subpattern <= lastSubpattern; subpattern++) { + loadPtr(Address(parenContextReg, ParenContext::subpatternOffset(subpattern)), tempReg); + storePtr(tempReg, Address(output, (subpattern << 1) * sizeof(unsigned))); + } + } + subpatternBaseFrameLocation += 
YarrStackSpaceForBackTrackInfoParentheses; + for (unsigned frameLocation = subpatternBaseFrameLocation; frameLocation < m_parenContextSizes.frameSlots(); frameLocation++) { + loadPtr(Address(parenContextReg, ParenContext::savedFrameOffset(m_parenContextSizes) + frameLocation * sizeof(uintptr_t)), tempReg); + storeToFrame(tempReg, frameLocation); + } + } #endif void optimizeAlternative(PatternAlternative* alternative) @@ -129,8 +318,10 @@ class YarrGenerator : private DefaultMacroAssembler { PatternTerm& term = alternative->m_terms[i]; PatternTerm& nextTerm = alternative->m_terms[i + 1]; + // We can move BMP only character classes after fixed character terms. if ((term.type == PatternTerm::TypeCharacterClass) && (term.quantityType == QuantifierFixedCount) + && (!m_decodeSurrogatePairs || (!term.characterClass->m_hasNonBMPCharacters && !term.m_invert)) && (nextTerm.type == PatternTerm::TypePatternCharacter) && (nextTerm.quantityType == QuantifierFixedCount)) { PatternTerm termCopy = term; @@ -140,7 +331,7 @@ class YarrGenerator : private DefaultMacroAssembler { } } - void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar* matches, unsigned matchCount) + void matchCharacterClassRange(RegisterID character, JumpList& failures, JumpList& matchDest, const CharacterRange* ranges, unsigned count, unsigned* matchIndex, const UChar32* matches, unsigned matchCount) { do { // pick which range we're going to generate @@ -189,26 +380,28 @@ class YarrGenerator : private DefaultMacroAssembler { void matchCharacterClass(RegisterID character, JumpList& matchDest, const CharacterClass* charClass) { - if (charClass->m_table) { + if (charClass->m_table && !m_decodeSurrogatePairs) { ExtendedAddress tableEntry(character, reinterpret_cast<intptr_t>(charClass->m_table)); matchDest.append(branchTest8(charClass->m_tableInverted ? 
Zero : NonZero, tableEntry)); return; } - Jump unicodeFail; + JumpList unicodeFail; if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) { - Jump isAscii = branch32(LessThanOrEqual, character, TrustedImm32(0x7f)); + JumpList isAscii; + if (charClass->m_matches.size() || charClass->m_ranges.size()) + isAscii.append(branch32(LessThanOrEqual, character, TrustedImm32(0x7f))); if (charClass->m_matchesUnicode.size()) { for (unsigned i = 0; i < charClass->m_matchesUnicode.size(); ++i) { - UChar ch = charClass->m_matchesUnicode[i]; + UChar32 ch = charClass->m_matchesUnicode[i]; matchDest.append(branch32(Equal, character, Imm32(ch))); } } if (charClass->m_rangesUnicode.size()) { for (unsigned i = 0; i < charClass->m_rangesUnicode.size(); ++i) { - UChar lo = charClass->m_rangesUnicode[i].begin; - UChar hi = charClass->m_rangesUnicode[i].end; + UChar32 lo = charClass->m_rangesUnicode[i].begin; + UChar32 hi = charClass->m_rangesUnicode[i].end; Jump below = branch32(LessThan, character, Imm32(lo)); matchDest.append(branch32(LessThanOrEqual, character, Imm32(hi))); @@ -216,18 +409,16 @@ class YarrGenerator : private DefaultMacroAssembler { } } - unicodeFail = jump(); + if (charClass->m_matches.size() || charClass->m_ranges.size()) + unicodeFail = jump(); isAscii.link(this); } if (charClass->m_ranges.size()) { unsigned matchIndex = 0; JumpList failures; - ASSERT(charClass->m_ranges.size() <= UINT_MAX); - matchCharacterClassRange(character, failures, matchDest, &charClass->m_ranges[0], - static_cast<unsigned>(charClass->m_ranges.size()), - &matchIndex, charClass->m_matches.isEmpty() ? 
0 : &charClass->m_matches[0], - static_cast<unsigned>(charClass->m_matches.size())); + matchCharacterClassRange(character, failures, matchDest, charClass->m_ranges.data(), charClass->m_ranges.size(), + &matchIndex, charClass->m_matches.data(), charClass->m_matches.size()); while (matchIndex < charClass->m_matches.size()) matchDest.append(branch32(Equal, character, Imm32((unsigned short)charClass->m_matches[matchIndex++]))); @@ -238,7 +429,7 @@ class YarrGenerator : private DefaultMacroAssembler { for (unsigned i = 0; i < charClass->m_matches.size(); ++i) { char ch = charClass->m_matches[i]; - if (m_pattern.m_ignoreCase) { + if (m_pattern.ignoreCase()) { if (isASCIILower(ch)) { matchesAZaz.append(ch); continue; @@ -249,8 +440,7 @@ class YarrGenerator : private DefaultMacroAssembler { matchDest.append(branch32(Equal, character, Imm32((unsigned short)ch))); } - ASSERT(matchesAZaz.size() <= UINT_MAX); - if (unsigned countAZaz = static_cast<int>(matchesAZaz.size())) { + if (unsigned countAZaz = matchesAZaz.size()) { or32(TrustedImm32(32), character); for (unsigned i = 0; i < countAZaz; ++i) matchDest.append(branch32(Equal, character, TrustedImm32(matchesAZaz[i]))); @@ -290,29 +480,102 @@ class YarrGenerator : private DefaultMacroAssembler { return branch32(NotEqual, index, length); } - Jump jumpIfCharNotEquals(UChar ch, int inputPosition, RegisterID character) + BaseIndex negativeOffsetIndexedAddress(Checked<unsigned> negativeCharacterOffset, RegisterID tempReg, RegisterID indexReg = index) { - readCharacter(inputPosition, character); - - // For case-insesitive compares, non-ascii characters that have different - // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { - or32(TrustedImm32(0x20), character); - ch |= 0x20; + RegisterID base = input; + + // BaseIndex() addressing can take a int32_t offset. 
Given that we can have a regular + // expression that has unsigned character offsets, BaseIndex's signed offset is insufficient + // for addressing in extreme cases where we might underflow. Therefore we check to see if + // negativeCharacterOffset will underflow directly or after converting for 16 bit characters. + // If so, we do our own address calculating by adjusting the base, using the result register + // as a temp address register. + unsigned maximumNegativeOffsetForCharacterSize = m_charSize == Char8 ? 0x7fffffff : 0x3fffffff; + unsigned offsetAdjustAmount = 0x40000000; + if (negativeCharacterOffset.unsafeGet() > maximumNegativeOffsetForCharacterSize) { + base = tempReg; + move(input, base); + while (negativeCharacterOffset.unsafeGet() > maximumNegativeOffsetForCharacterSize) { + subPtr(TrustedImm32(offsetAdjustAmount), base); + if (m_charSize != Char8) + subPtr(TrustedImm32(offsetAdjustAmount), base); + negativeCharacterOffset -= offsetAdjustAmount; + } } - return branch32(NotEqual, character, Imm32(ch)); + Checked<int32_t> characterOffset(-static_cast<int32_t>(negativeCharacterOffset.unsafeGet())); + + if (m_charSize == Char8) + return BaseIndex(input, indexReg, TimesOne, (characterOffset * static_cast<int32_t>(sizeof(char))).unsafeGet()); + + return BaseIndex(input, indexReg, TimesTwo, (characterOffset * static_cast<int32_t>(sizeof(UChar))).unsafeGet()); + } + +#ifdef JIT_UNICODE_EXPRESSIONS + void tryReadUnicodeCharImpl(RegisterID resultReg) + { + ASSERT(m_charSize == Char16); + + JumpList notUnicode; + load16Unaligned(regUnicodeInputAndTrail, resultReg); + and32(surrogateTagMask, resultReg, regT2); + notUnicode.append(branch32(NotEqual, regT2, leadingSurrogateTag)); + addPtr(TrustedImm32(2), regUnicodeInputAndTrail); + getEffectiveAddress(BaseIndex(input, length, TimesTwo), regT2); + notUnicode.append(branch32(AboveOrEqual, regUnicodeInputAndTrail, regT2)); + load16Unaligned(Address(regUnicodeInputAndTrail), regUnicodeInputAndTrail); + 
and32(surrogateTagMask, regUnicodeInputAndTrail, regT2); + notUnicode.append(branch32(NotEqual, regT2, trailingSurrogateTag)); + sub32(leadingSurrogateTag, resultReg); + sub32(trailingSurrogateTag, regUnicodeInputAndTrail); + lshift32(TrustedImm32(10), resultReg); + or32(regUnicodeInputAndTrail, resultReg); + add32(supplementaryPlanesBase, resultReg); + notUnicode.link(this); + } + + void tryReadUnicodeChar(BaseIndex address, RegisterID resultReg) + { + ASSERT(m_charSize == Char16); + + getEffectiveAddress(address, regUnicodeInputAndTrail); + + if (resultReg == regT0) + m_tryReadUnicodeCharacterCalls.append(nearCall()); + else + tryReadUnicodeCharImpl(resultReg); } +#endif - void readCharacter(int inputPosition, RegisterID reg) + void readCharacter(Checked<unsigned> negativeCharacterOffset, RegisterID resultReg, RegisterID indexReg = index) { + BaseIndex address = negativeOffsetIndexedAddress(negativeCharacterOffset, resultReg, indexReg); + if (m_charSize == Char8) - load8(BaseIndex(input, index, TimesOne, inputPosition * sizeof(char)), reg); + load8(address, resultReg); +#ifdef JIT_UNICODE_EXPRESSIONS + else if (m_decodeSurrogatePairs) + tryReadUnicodeChar(address, resultReg); +#endif else - load16(BaseIndex(input, index, TimesTwo, inputPosition * sizeof(UChar)), reg); + load16Unaligned(address, resultReg); } + Jump jumpIfCharNotEquals(UChar32 ch, Checked<unsigned> negativeCharacterOffset, RegisterID character) + { + readCharacter(negativeCharacterOffset, character); + + // For case-insesitive compares, non-ascii characters that have different + // upper & lower case representations are converted to a character class. 
+ ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch, m_canonicalMode)); + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) { + or32(TrustedImm32(0x20), character); + ch |= 0x20; + } + + return branch32(NotEqual, character, Imm32(ch)); + } + void storeToFrame(RegisterID reg, unsigned frameLocation) { poke(reg, frameLocation); @@ -323,9 +586,16 @@ class YarrGenerator : private DefaultMacroAssembler { poke(imm, frameLocation); } +#if CPU(ARM64) || CPU(X86_64) + void storeToFrame(TrustedImmPtr imm, unsigned frameLocation) + { + poke(imm, frameLocation); + } +#endif + DataLabelPtr storeToFrameWithPatch(unsigned frameLocation) { - return storePtrWithPatch(TrustedImmPtr(0), Address(stackPointerRegister, frameLocation * sizeof(void*))); + return storePtrWithPatch(TrustedImmPtr(nullptr), Address(stackPointerRegister, frameLocation * sizeof(void*))); } void loadFromFrame(unsigned frameLocation, RegisterID reg) @@ -340,32 +610,82 @@ class YarrGenerator : private DefaultMacroAssembler { unsigned alignCallFrameSizeInBytes(unsigned callFrameSize) { + if (!callFrameSize) + return 0; + callFrameSize *= sizeof(void*); if (callFrameSize / sizeof(void*) != m_pattern.m_body->m_callFrameSize) CRASH(); - // Originally, the code was: -// callFrameSize = (callFrameSize + 0x3f) & ~0x3f; - // However, 64 bytes is a bit surprising. The biggest "alignment" requirement is on Aarch64, where: - // "SP mod 16 = 0. The stack must be quad-word aligned." 
(IHI0055B_aapcs64.pdf) - callFrameSize = (callFrameSize + 0xf) & ~0xf; - if (!callFrameSize) - CRASH(); + callFrameSize = (callFrameSize + 0x3f) & ~0x3f; return callFrameSize; } void initCallFrame() { - unsigned callFrameSize = m_pattern.m_body->m_callFrameSize; - if (callFrameSize) - subPtr(Imm32(alignCallFrameSizeInBytes(callFrameSize)), stackPointerRegister); + unsigned callFrameSizeInBytes = alignCallFrameSizeInBytes(m_pattern.m_body->m_callFrameSize); + if (callFrameSizeInBytes) { +#if CPU(X86_64) || CPU(ARM64) + if (Options::zeroStackFrame()) { + // We need to start from the stack pointer, because we could have spilled callee saves + move(stackPointerRegister, regT0); + subPtr(Imm32(callFrameSizeInBytes), stackPointerRegister); + if (callFrameSizeInBytes <= 128) { + for (unsigned offset = 0; offset < callFrameSizeInBytes; offset += sizeof(intptr_t)) + storePtr(TrustedImmPtr(0), Address(regT0, -8 - offset)); + } else { + Label zeroLoop = label(); + subPtr(TrustedImm32(sizeof(intptr_t) * 2), regT0); +#if CPU(ARM64) + storePair64(ARM64Registers::zr, ARM64Registers::zr, regT0); +#else + storePtr(TrustedImmPtr(0), Address(regT0)); + storePtr(TrustedImmPtr(0), Address(regT0, sizeof(intptr_t))); +#endif + branchPtr(NotEqual, regT0, stackPointerRegister).linkTo(zeroLoop, this); + } + } else +#endif + subPtr(Imm32(callFrameSizeInBytes), stackPointerRegister); + + } } void removeCallFrame() { - unsigned callFrameSize = m_pattern.m_body->m_callFrameSize; - if (callFrameSize) - addPtr(Imm32(alignCallFrameSizeInBytes(callFrameSize)), stackPointerRegister); + unsigned callFrameSizeInBytes = alignCallFrameSizeInBytes(m_pattern.m_body->m_callFrameSize); + if (callFrameSizeInBytes) + addPtr(Imm32(callFrameSizeInBytes), stackPointerRegister); + } + + void generateFailReturn() + { + move(TrustedImmPtr((void*)WTF::notFound), returnRegister); + move(TrustedImm32(0), returnRegister2); + generateReturn(); + } + + void generateJITFailReturn() + { + if (m_abortExecution.empty() && 
m_hitMatchLimit.empty()) + return; + + JumpList finishExiting; + if (!m_abortExecution.empty()) { + m_abortExecution.link(this); + move(TrustedImmPtr((void*)static_cast<size_t>(-2)), returnRegister); + finishExiting.append(jump()); + } + + if (!m_hitMatchLimit.empty()) { + m_hitMatchLimit.link(this); + move(TrustedImmPtr((void*)static_cast<size_t>(-1)), returnRegister); + } + + finishExiting.link(this); + removeCallFrame(); + move(TrustedImm32(0), returnRegister2); + generateReturn(); } - // Used to record subpatters, should only be called if compileMode is IncludeSubpatterns. + // Used to record subpatterns, should only be called if compileMode is IncludeSubpatterns. void setSubpatternStart(RegisterID reg, unsigned subpattern) { ASSERT(subpattern); @@ -385,6 +705,12 @@ class YarrGenerator : private DefaultMacroAssembler { store32(TrustedImm32(-1), Address(output, (subpattern << 1) * sizeof(int))); } + void clearMatches(unsigned subpattern, unsigned lastSubpattern) + { + for (; subpattern <= lastSubpattern; subpattern++) + clearSubpatternStart(subpattern); + } + // We use one of three different strategies to track the start of the current match, // while matching. // 1) If the pattern has a fixed size, do nothing! - we calculate the value lazily @@ -427,18 +753,21 @@ class YarrGenerator : private DefaultMacroAssembler { OpNestedAlternativeNext, OpNestedAlternativeEnd, // Used for alternatives in subpatterns where there is only a single - // alternative (backtrackingis easier in these cases), or for alternatives + // alternative (backtracking is easier in these cases), or for alternatives // which never need to be backtracked (those in parenthetical assertions, // terminal subpatterns). OpSimpleNestedAlternativeBegin, OpSimpleNestedAlternativeNext, OpSimpleNestedAlternativeEnd, - // Used to wrap 'Once' subpattern matches (quantityCount == 1). + // Used to wrap 'Once' subpattern matches (quantityMaxCount == 1). 
OpParenthesesSubpatternOnceBegin, OpParenthesesSubpatternOnceEnd, // Used to wrap 'Terminal' subpattern matches (at the end of the regexp). OpParenthesesSubpatternTerminalBegin, OpParenthesesSubpatternTerminalEnd, + // Used to wrap generic captured matches + OpParenthesesSubpatternBegin, + OpParenthesesSubpatternEnd, // Used to wrap parenthetical assertions. OpParentheticalAssertionBegin, OpParentheticalAssertionEnd, @@ -468,16 +797,16 @@ class YarrGenerator : private DefaultMacroAssembler { // The operation, as a YarrOpCode, and also a reference to the PatternTerm. YarrOpCode m_op; - PatternTerm* m_term = nullptr; + PatternTerm* m_term; // For alternatives, this holds the PatternAlternative and doubly linked // references to this alternative's siblings. In the case of the // OpBodyAlternativeEnd node at the end of a section of repeating nodes, // m_nextOp will reference the OpBodyAlternativeBegin node of the first // repeating alternative. - PatternAlternative* m_alternative = nullptr; - size_t m_previousOp = 0; - size_t m_nextOp = 0; + PatternAlternative* m_alternative; + size_t m_previousOp; + size_t m_nextOp; // Used to record a set of Jumps out of the generated code, typically // used for jumps out to backtracking code, and a single reentry back @@ -495,9 +824,9 @@ class YarrGenerator : private DefaultMacroAssembler { bool m_isDeadCode; // Currently used in the case of some of the more complex management of - // 'm_checked', to cache the offset used in this alternative, to avoid + // 'm_checkedOffset', to cache the offset used in this alternative, to avoid // recalculating it. - int m_checkAdjust; + Checked<unsigned> m_checkAdjust; // Used by OpNestedAlternativeNext/End to hold the pointer to the // value that will be pushed into the pattern's frame to return to, @@ -599,7 +928,7 @@ class YarrGenerator : private DefaultMacroAssembler { } // Called at the end of code generation to link all return addresses. 
- void linkDataLabels(LinkBuffer<JSC::DefaultMacroAssembler>& linkBuffer) + void linkDataLabels(DefaultLinkBuffer& linkBuffer) { ASSERT(isEmpty()); for (unsigned i = 0; i < m_backtrackRecords.size(); ++i) @@ -642,14 +971,14 @@ class YarrGenerator : private DefaultMacroAssembler { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - if (m_pattern.m_multiline) { + if (m_pattern.multiline()) { const RegisterID character = regT0; JumpList matchDest; if (!term->inputPosition) - matchDest.append(branch32(Equal, index, Imm32(m_checked))); + matchDest.append(branch32(Equal, index, Imm32(m_checkedOffset.unsafeGet()))); - readCharacter((term->inputPosition - m_checked) - 1, character); + readCharacter(m_checkedOffset - term->inputPosition + 1, character); matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); op.m_jumps.append(jump()); @@ -659,7 +988,7 @@ class YarrGenerator : private DefaultMacroAssembler { if (term->inputPosition) op.m_jumps.append(jump()); else - op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checked))); + op.m_jumps.append(branch32(NotEqual, index, Imm32(m_checkedOffset.unsafeGet()))); } } void backtrackAssertionBOL(size_t opIndex) @@ -672,20 +1001,20 @@ class YarrGenerator : private DefaultMacroAssembler { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - if (m_pattern.m_multiline) { + if (m_pattern.multiline()) { const RegisterID character = regT0; JumpList matchDest; - if (term->inputPosition == m_checked) + if (term->inputPosition == m_checkedOffset.unsafeGet()) matchDest.append(atEndOfInput()); - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, matchDest, m_pattern.newlineCharacterClass()); op.m_jumps.append(jump()); matchDest.link(this); } else { - if (term->inputPosition == m_checked) + if (term->inputPosition == m_checkedOffset.unsafeGet()) op.m_jumps.append(notAtEndOfInput()); // Erk, really 
should poison out these alternatives early. :-/ else @@ -705,11 +1034,19 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; - if (term->inputPosition == m_checked) + if (term->inputPosition == m_checkedOffset.unsafeGet()) nextIsNotWordChar.append(atEndOfInput()); - readCharacter((term->inputPosition - m_checked), character); - matchCharacterClass(character, nextIsWordChar, m_pattern.wordcharCharacterClass()); + readCharacter(m_checkedOffset - term->inputPosition, character); + + CharacterClass* wordcharCharacterClass; + + if (m_unicodeIgnoreCase) + wordcharCharacterClass = m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(); + else + wordcharCharacterClass = m_pattern.wordcharCharacterClass(); + + matchCharacterClass(character, nextIsWordChar, wordcharCharacterClass); } void generateAssertionWordBoundary(size_t opIndex) @@ -722,9 +1059,17 @@ class YarrGenerator : private DefaultMacroAssembler { Jump atBegin; JumpList matchDest; if (!term->inputPosition) - atBegin = branch32(Equal, index, Imm32(m_checked)); - readCharacter((term->inputPosition - m_checked) - 1, character); - matchCharacterClass(character, matchDest, m_pattern.wordcharCharacterClass()); + atBegin = branch32(Equal, index, Imm32(m_checkedOffset.unsafeGet())); + readCharacter(m_checkedOffset - term->inputPosition + 1, character); + + CharacterClass* wordcharCharacterClass; + + if (m_unicodeIgnoreCase) + wordcharCharacterClass = m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(); + else + wordcharCharacterClass = m_pattern.wordcharCharacterClass(); + + matchCharacterClass(character, matchDest, wordcharCharacterClass); if (!term->inputPosition) atBegin.link(this); @@ -775,7 +1120,7 @@ class YarrGenerator : private DefaultMacroAssembler { YarrOp* nextOp = &m_ops[opIndex + 1]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; if ((ch > 0xff) && (m_charSize == Char8)) { // Have a 16 bit pattern character and 
an 8 bit string - short circuit @@ -784,21 +1129,21 @@ class YarrGenerator : private DefaultMacroAssembler { } const RegisterID character = regT0; - int maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; + unsigned maxCharactersAtOnce = m_charSize == Char8 ? 4 : 2; unsigned ignoreCaseMask = 0; #if CPU(BIG_ENDIAN) int allCharacters = ch << (m_charSize == Char8 ? 24 : 16); #else int allCharacters = ch; #endif - int numberCharacters; - int startTermPosition = term->inputPosition; + unsigned numberCharacters; + unsigned startTermPosition = term->inputPosition; // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch, m_canonicalMode)); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) #if CPU(BIG_ENDIAN) ignoreCaseMask |= 32 << (m_charSize == Char8 ? 24 : 16); #else @@ -810,8 +1155,9 @@ class YarrGenerator : private DefaultMacroAssembler { if (nextTerm->type != PatternTerm::TypePatternCharacter || nextTerm->quantityType != QuantifierFixedCount - || nextTerm->quantityCount != 1 - || nextTerm->inputPosition != (startTermPosition + numberCharacters)) + || nextTerm->quantityMaxCount != 1 + || nextTerm->inputPosition != (startTermPosition + numberCharacters) + || (U16_LENGTH(nextTerm->patternCharacter) != 1 && m_decodeSurrogatePairs)) break; nextOp->m_isDeadCode = true; @@ -822,7 +1168,7 @@ class YarrGenerator : private DefaultMacroAssembler { int shiftAmount = (m_charSize == Char8 ? 
8 : 16) * numberCharacters; #endif - UChar currentCharacter = nextTerm->patternCharacter; + UChar32 currentCharacter = nextTerm->patternCharacter; if ((currentCharacter > 0xff) && (m_charSize == Char8)) { // Have a 16 bit pattern character and an 8 bit string - short circuit @@ -832,47 +1178,43 @@ class YarrGenerator : private DefaultMacroAssembler { // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter)); + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter, m_canonicalMode)); allCharacters |= (currentCharacter << shiftAmount); - if ((m_pattern.m_ignoreCase) && (isASCIIAlpha(currentCharacter))) + if ((m_pattern.ignoreCase()) && (isASCIIAlpha(currentCharacter))) ignoreCaseMask |= 32 << shiftAmount; } if (m_charSize == Char8) { switch (numberCharacters) { case 1: - op.m_jumps.append(jumpIfCharNotEquals(ch, startTermPosition - m_checked, character)); + op.m_jumps.append(jumpIfCharNotEquals(ch, m_checkedOffset - startTermPosition, character)); return; case 2: { - BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load16Unaligned(address, character); + load16Unaligned(negativeOffsetIndexedAddress(m_checkedOffset - startTermPosition, character), character); break; } case 3: { - BaseIndex highAddress(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load16Unaligned(highAddress, character); + load16Unaligned(negativeOffsetIndexedAddress(m_checkedOffset - startTermPosition, character), character); if (ignoreCaseMask) or32(Imm32(ignoreCaseMask), character); op.m_jumps.append(branch32(NotEqual, character, Imm32((allCharacters & 0xffff) | ignoreCaseMask))); - op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, startTermPosition + 2 - m_checked, 
character)); + op.m_jumps.append(jumpIfCharNotEquals(allCharacters >> 16, m_checkedOffset - startTermPosition - 2, character)); return; } case 4: { - BaseIndex address(input, index, TimesOne, (startTermPosition - m_checked) * sizeof(LChar)); - load32WithUnalignedHalfWords(address, character); + load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(m_checkedOffset- startTermPosition, character), character); break; } } } else { switch (numberCharacters) { case 1: - op.m_jumps.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + op.m_jumps.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); return; case 2: - BaseIndex address(input, index, TimesTwo, (term->inputPosition - m_checked) * sizeof(UChar)); - load32WithUnalignedHalfWords(address, character); + load32WithUnalignedHalfWords(negativeOffsetIndexedAddress(m_checkedOffset- term->inputPosition, character), character); break; } } @@ -891,32 +1233,33 @@ class YarrGenerator : private DefaultMacroAssembler { { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; const RegisterID character = regT0; const RegisterID countRegister = regT1; move(index, countRegister); - sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + Checked<unsigned> scaledMaxCount = term->quantityMaxCount; + scaledMaxCount *= U_IS_BMP(ch) ? 1 : 2; + sub32(Imm32(scaledMaxCount.unsafeGet()), countRegister); Label loop(this); - BaseIndex address(input, countRegister, m_charScale, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(m_charSize == Char8 ? 
sizeof(char) : sizeof(UChar))).unsafeGet()); - - if (m_charSize == Char8) - load8(address, character); - else - load16(address, character); - + readCharacter(m_checkedOffset - term->inputPosition - scaledMaxCount, character, countRegister); // For case-insesitive compares, non-ascii characters that have different // upper & lower case representations are converted to a character class. - ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); - if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { + ASSERT(!m_pattern.ignoreCase() || isASCIIAlpha(ch) || isCanonicallyUnique(ch, m_canonicalMode)); + if (m_pattern.ignoreCase() && isASCIIAlpha(ch)) { or32(TrustedImm32(0x20), character); ch |= 0x20; } op.m_jumps.append(branch32(NotEqual, character, Imm32(ch))); - add32(TrustedImm32(1), countRegister); +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs && !U_IS_BMP(ch)) + add32(TrustedImm32(2), countRegister); + else +#endif + add32(TrustedImm32(1), countRegister); branch32(NotEqual, countRegister, index).linkTo(loop, this); } void backtrackPatternCharacterFixed(size_t opIndex) @@ -928,7 +1271,7 @@ class YarrGenerator : private DefaultMacroAssembler { { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; const RegisterID character = regT0; const RegisterID countRegister = regT1; @@ -940,20 +1283,30 @@ class YarrGenerator : private DefaultMacroAssembler { JumpList failures; Label loop(this); failures.append(atEndOfInput()); - failures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + failures.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); - add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); - if (term->quantityCount == quantifyInfinite) +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs && !U_IS_BMP(ch)) { + Jump surrogatePairOk = notAtEndOfInput(); + 
sub32(TrustedImm32(1), index); + failures.append(jump()); + surrogatePairOk.link(this); + add32(TrustedImm32(1), index); + } +#endif + add32(TrustedImm32(1), countRegister); + + if (term->quantityMaxCount == quantifyInfinite) jump(loop); else - branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); + branch32(NotEqual, countRegister, Imm32(term->quantityMaxCount.unsafeGet())).linkTo(loop, this); failures.link(this); } op.m_reentry = label(); - storeToFrame(countRegister, term->frameLocation); + storeToFrame(countRegister, term->frameLocation + BackTrackInfoPatternCharacter::matchAmountIndex()); } void backtrackPatternCharacterGreedy(size_t opIndex) { @@ -964,10 +1317,13 @@ class YarrGenerator : private DefaultMacroAssembler { m_backtrackingState.link(this); - loadFromFrame(term->frameLocation, countRegister); + loadFromFrame(term->frameLocation + BackTrackInfoPatternCharacter::matchAmountIndex(), countRegister); m_backtrackingState.append(branchTest32(Zero, countRegister)); sub32(TrustedImm32(1), countRegister); - sub32(TrustedImm32(1), index); + if (!m_decodeSurrogatePairs || U_IS_BMP(term->patternCharacter)) + sub32(TrustedImm32(1), index); + else + sub32(TrustedImm32(2), index); jump(op.m_reentry); } @@ -980,36 +1336,50 @@ class YarrGenerator : private DefaultMacroAssembler { move(TrustedImm32(0), countRegister); op.m_reentry = label(); - storeToFrame(countRegister, term->frameLocation); + storeToFrame(countRegister, term->frameLocation + BackTrackInfoPatternCharacter::matchAmountIndex()); } void backtrackPatternCharacterNonGreedy(size_t opIndex) { YarrOp& op = m_ops[opIndex]; PatternTerm* term = op.m_term; - UChar ch = term->patternCharacter; + UChar32 ch = term->patternCharacter; const RegisterID character = regT0; const RegisterID countRegister = regT1; m_backtrackingState.link(this); - loadFromFrame(term->frameLocation, countRegister); + loadFromFrame(term->frameLocation + 
BackTrackInfoPatternCharacter::matchAmountIndex(), countRegister); // Unless have a 16 bit pattern character and an 8 bit string - short circuit if (!((ch > 0xff) && (m_charSize == Char8))) { JumpList nonGreedyFailures; nonGreedyFailures.append(atEndOfInput()); - if (term->quantityCount != quantifyInfinite) - nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); - nonGreedyFailures.append(jumpIfCharNotEquals(ch, term->inputPosition - m_checked, character)); + if (term->quantityMaxCount != quantifyInfinite) + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityMaxCount.unsafeGet()))); + nonGreedyFailures.append(jumpIfCharNotEquals(ch, m_checkedOffset - term->inputPosition, character)); - add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs && !U_IS_BMP(ch)) { + Jump surrogatePairOk = notAtEndOfInput(); + sub32(TrustedImm32(1), index); + nonGreedyFailures.append(jump()); + surrogatePairOk.link(this); + add32(TrustedImm32(1), index); + } +#endif + add32(TrustedImm32(1), countRegister); jump(op.m_reentry); nonGreedyFailures.link(this); } + if (m_decodeSurrogatePairs && !U_IS_BMP(ch)) { + // subtract countRegister*2 for non-BMP characters + lshift32(TrustedImm32(1), countRegister); + } + sub32(countRegister, index); m_backtrackingState.fallthrough(); } @@ -1021,19 +1391,43 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; + if (m_decodeSurrogatePairs) + storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex()); + JumpList matchDest; - readCharacter(term->inputPosition - m_checked, character); - matchCharacterClass(character, matchDest, term->characterClass); + readCharacter(m_checkedOffset - term->inputPosition, character); + // If we are matching the "any character" builtin class we only need to read the + // character and don't need to match as it will 
always succeed. + if (term->invert() || !term->characterClass->m_anyCharacter) { + matchCharacterClass(character, matchDest, term->characterClass); - if (term->invert()) - op.m_jumps.append(matchDest); - else { - op.m_jumps.append(jump()); - matchDest.link(this); + if (term->invert()) + op.m_jumps.append(matchDest); + else { + op.m_jumps.append(jump()); + matchDest.link(this); + } } +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs) { + Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase); + add32(TrustedImm32(1), index); + isBMPChar.link(this); + } +#endif } void backtrackCharacterClassOnce(size_t opIndex) { +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs) { + YarrOp& op = m_ops[opIndex]; + PatternTerm* term = op.m_term; + + m_backtrackingState.link(this); + loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index); + m_backtrackingState.fallthrough(); + } +#endif backtrackTermDefault(opIndex); } @@ -1046,24 +1440,34 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID countRegister = regT1; move(index, countRegister); - sub32(Imm32(term->quantityCount.unsafeGet()), countRegister); + sub32(Imm32(term->quantityMaxCount.unsafeGet()), countRegister); Label loop(this); JumpList matchDest; - if (m_charSize == Char8) - load8(BaseIndex(input, countRegister, TimesOne, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(char))).unsafeGet()), character); - else - load16(BaseIndex(input, countRegister, TimesTwo, (Checked<int>(term->inputPosition - m_checked + Checked<int64_t>(term->quantityCount)) * static_cast<int>(sizeof(UChar))).unsafeGet()), character); - matchCharacterClass(character, matchDest, term->characterClass); + readCharacter(m_checkedOffset - term->inputPosition - term->quantityMaxCount, character, countRegister); + // If we are matching the "any character" builtin class we only need to read the + // character 
and don't need to match as it will always succeed. + if (term->invert() || !term->characterClass->m_anyCharacter) { + matchCharacterClass(character, matchDest, term->characterClass); - if (term->invert()) - op.m_jumps.append(matchDest); - else { - op.m_jumps.append(jump()); - matchDest.link(this); + if (term->invert()) + op.m_jumps.append(matchDest); + else { + op.m_jumps.append(jump()); + matchDest.link(this); + } } add32(TrustedImm32(1), countRegister); +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs) { + Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase); + op.m_jumps.append(atEndOfInput()); + add32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + isBMPChar.link(this); + } +#endif branch32(NotEqual, countRegister, index).linkTo(loop, this); } void backtrackCharacterClassFixed(size_t opIndex) @@ -1079,6 +1483,8 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; const RegisterID countRegister = regT1; + if (m_decodeSurrogatePairs) + storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex()); move(TrustedImm32(0), countRegister); JumpList failures; @@ -1086,20 +1492,33 @@ class YarrGenerator : private DefaultMacroAssembler { failures.append(atEndOfInput()); if (term->invert()) { - readCharacter(term->inputPosition - m_checked, character); + readCharacter(m_checkedOffset - term->inputPosition, character); matchCharacterClass(character, failures, term->characterClass); } else { JumpList matchDest; - readCharacter(term->inputPosition - m_checked, character); - matchCharacterClass(character, matchDest, term->characterClass); - failures.append(jump()); + readCharacter(m_checkedOffset - term->inputPosition, character); + // If we are matching the "any character" builtin class we only need to read the + // character and don't need to match as it will always succeed. 
+ if (!term->characterClass->m_anyCharacter) { + matchCharacterClass(character, matchDest, term->characterClass); + failures.append(jump()); + } matchDest.link(this); } - add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); - if (term->quantityCount != quantifyInfinite) { - branch32(NotEqual, countRegister, Imm32(term->quantityCount.unsafeGet())).linkTo(loop, this); +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs) { + failures.append(atEndOfInput()); + Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase); + add32(TrustedImm32(1), index); + isBMPChar.link(this); + } +#endif + add32(TrustedImm32(1), countRegister); + + if (term->quantityMaxCount != quantifyInfinite) { + branch32(NotEqual, countRegister, Imm32(term->quantityMaxCount.unsafeGet())).linkTo(loop, this); failures.append(jump()); } else jump(loop); @@ -1107,7 +1526,7 @@ class YarrGenerator : private DefaultMacroAssembler { failures.link(this); op.m_reentry = label(); - storeToFrame(countRegister, term->frameLocation); + storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex()); } void backtrackCharacterClassGreedy(size_t opIndex) { @@ -1118,10 +1537,34 @@ class YarrGenerator : private DefaultMacroAssembler { m_backtrackingState.link(this); - loadFromFrame(term->frameLocation, countRegister); + loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister); m_backtrackingState.append(branchTest32(Zero, countRegister)); sub32(TrustedImm32(1), countRegister); - sub32(TrustedImm32(1), index); + if (!m_decodeSurrogatePairs) + sub32(TrustedImm32(1), index); + else { + const RegisterID character = regT0; + + loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index); + // Rematch one less + storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex()); + + Label rematchLoop(this); + readCharacter(m_checkedOffset - 
term->inputPosition, character); + + sub32(TrustedImm32(1), countRegister); + add32(TrustedImm32(1), index); + +#ifdef JIT_UNICODE_EXPRESSIONS + Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase); + add32(TrustedImm32(1), index); + isBMPChar.link(this); +#endif + + branchTest32(Zero, countRegister).linkTo(rematchLoop, this); + + loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister); + } jump(op.m_reentry); } @@ -1134,8 +1577,11 @@ class YarrGenerator : private DefaultMacroAssembler { move(TrustedImm32(0), countRegister); op.m_reentry = label(); - storeToFrame(countRegister, term->frameLocation); + if (m_decodeSurrogatePairs) + storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex()); + storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex()); } + void backtrackCharacterClassNonGreedy(size_t opIndex) { YarrOp& op = m_ops[opIndex]; @@ -1148,24 +1594,38 @@ class YarrGenerator : private DefaultMacroAssembler { m_backtrackingState.link(this); - loadFromFrame(term->frameLocation, countRegister); + if (m_decodeSurrogatePairs) + loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index); + loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister); nonGreedyFailures.append(atEndOfInput()); - nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityCount.unsafeGet()))); + nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityMaxCount.unsafeGet()))); JumpList matchDest; - readCharacter(term->inputPosition - m_checked, character); - matchCharacterClass(character, matchDest, term->characterClass); + readCharacter(m_checkedOffset - term->inputPosition, character); + // If we are matching the "any character" builtin class we only need to read the + // character and don't need to match as it will always succeed. 
+ if (term->invert() || !term->characterClass->m_anyCharacter) { + matchCharacterClass(character, matchDest, term->characterClass); - if (term->invert()) - nonGreedyFailures.append(matchDest); - else { - nonGreedyFailures.append(jump()); - matchDest.link(this); + if (term->invert()) + nonGreedyFailures.append(matchDest); + else { + nonGreedyFailures.append(jump()); + matchDest.link(this); + } } - add32(TrustedImm32(1), countRegister); add32(TrustedImm32(1), index); +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs) { + nonGreedyFailures.append(atEndOfInput()); + Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase); + add32(TrustedImm32(1), index); + isBMPChar.link(this); + } +#endif + add32(TrustedImm32(1), countRegister); jump(op.m_reentry); @@ -1181,15 +1641,28 @@ class YarrGenerator : private DefaultMacroAssembler { const RegisterID character = regT0; const RegisterID matchPos = regT1; +#ifndef HAVE_INITIAL_START_REG + const RegisterID initialStart = character; +#endif JumpList foundBeginningNewLine; JumpList saveStartIndex; JumpList foundEndingNewLine; + if (m_pattern.dotAll()) { + move(TrustedImm32(0), matchPos); + setMatchStart(matchPos); + move(length, index); + return; + } + ASSERT(!m_pattern.m_body->m_hasFixedSize); getMatchStart(matchPos); - saveStartIndex.append(branchTest32(Zero, matchPos)); +#ifndef HAVE_INITIAL_START_REG + loadFromFrame(m_pattern.m_initialStartValueFrameLocation, initialStart); +#endif + saveStartIndex.append(branch32(BelowOrEqual, matchPos, initialStart)); Label findBOLLoop(this); sub32(TrustedImm32(1), matchPos); if (m_charSize == Char8) @@ -1197,14 +1670,18 @@ class YarrGenerator : private DefaultMacroAssembler { else load16(BaseIndex(input, matchPos, TimesTwo, 0), character); matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass()); - branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this); + +#ifndef HAVE_INITIAL_START_REG + 
loadFromFrame(m_pattern.m_initialStartValueFrameLocation, initialStart); +#endif + branch32(Above, matchPos, initialStart).linkTo(findBOLLoop, this); saveStartIndex.append(jump()); foundBeginningNewLine.link(this); add32(TrustedImm32(1), matchPos); // Advance past newline saveStartIndex.link(this); - if (!m_pattern.m_multiline && term->anchors.bolAnchor) + if (!m_pattern.multiline() && term->anchors.bolAnchor) op.m_jumps.append(branchTest32(NonZero, matchPos)); ASSERT(!m_pattern.m_body->m_hasFixedSize); @@ -1224,7 +1701,7 @@ class YarrGenerator : private DefaultMacroAssembler { foundEndingNewLine.link(this); - if (!m_pattern.m_multiline && term->anchors.eolAnchor) + if (!m_pattern.multiline() && term->anchors.eolAnchor) op.m_jumps.append(branch32(NotEqual, matchPos, length)); move(matchPos, index); @@ -1247,7 +1724,7 @@ class YarrGenerator : private DefaultMacroAssembler { case PatternTerm::TypePatternCharacter: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) generatePatternCharacterOnce(opIndex); else generatePatternCharacterFixed(opIndex); @@ -1264,7 +1741,7 @@ class YarrGenerator : private DefaultMacroAssembler { case PatternTerm::TypeCharacterClass: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) generateCharacterClassOnce(opIndex); else generateCharacterClassFixed(opIndex); @@ -1297,7 +1774,7 @@ class YarrGenerator : private DefaultMacroAssembler { case PatternTerm::TypeParentheticalAssertion: RELEASE_ASSERT_NOT_REACHED(); case PatternTerm::TypeBackReference: - m_shouldFallBack = true; + m_failureReason = JITFailureReason::BackReference; break; case PatternTerm::TypeDotStarEnclosure: generateDotStarEnclosure(opIndex); @@ -1313,7 +1790,7 @@ class YarrGenerator : private DefaultMacroAssembler { case PatternTerm::TypePatternCharacter: switch (term->quantityType) { case QuantifierFixedCount: - if 
(term->quantityCount == 1) + if (term->quantityMaxCount == 1) backtrackPatternCharacterOnce(opIndex); else backtrackPatternCharacterFixed(opIndex); @@ -1330,7 +1807,7 @@ class YarrGenerator : private DefaultMacroAssembler { case PatternTerm::TypeCharacterClass: switch (term->quantityType) { case QuantifierFixedCount: - if (term->quantityCount == 1) + if (term->quantityMaxCount == 1) backtrackCharacterClassOnce(opIndex); else backtrackCharacterClassFixed(opIndex); @@ -1368,7 +1845,7 @@ class YarrGenerator : private DefaultMacroAssembler { break; case PatternTerm::TypeBackReference: - m_shouldFallBack = true; + m_failureReason = JITFailureReason::BackReference; break; } } @@ -1419,7 +1896,7 @@ class YarrGenerator : private DefaultMacroAssembler { // set as appropriate to this alternative. op.m_reentry = label(); - m_checked += alternative->m_minimumSize; + m_checkedOffset += alternative->m_minimumSize; break; } case OpBodyAlternativeNext: @@ -1472,8 +1949,8 @@ class YarrGenerator : private DefaultMacroAssembler { } if (op.m_op == OpBodyAlternativeNext) - m_checked += alternative->m_minimumSize; - m_checked -= priorAlternative->m_minimumSize; + m_checkedOffset += alternative->m_minimumSize; + m_checkedOffset -= priorAlternative->m_minimumSize; break; } @@ -1500,13 +1977,13 @@ class YarrGenerator : private DefaultMacroAssembler { PatternDisjunction* disjunction = term->parentheses.disjunction; // Calculate how much input we need to check for, and if non-zero check. 
- op.m_checkAdjust = alternative->m_minimumSize; + op.m_checkAdjust = Checked<unsigned>(alternative->m_minimumSize); if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) op.m_checkAdjust -= disjunction->m_minimumSize; if (op.m_checkAdjust) - op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust.unsafeGet())); - m_checked += op.m_checkAdjust; + m_checkedOffset += op.m_checkAdjust; break; } case OpSimpleNestedAlternativeNext: @@ -1518,10 +1995,7 @@ class YarrGenerator : private DefaultMacroAssembler { // In the non-simple case, store a 'return address' so we can backtrack correctly. if (op.m_op == OpNestedAlternativeNext) { unsigned parenthesesFrameLocation = term->frameLocation; - unsigned alternativeFrameLocation = parenthesesFrameLocation; - if (term->quantityType != QuantifierFixedCount) - alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; - op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); + op.m_returnAddress = storeToFrameWithPatch(parenthesesFrameLocation + BackTrackInfoParentheses::returnAddressIndex()); } if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) { @@ -1554,11 +2028,11 @@ class YarrGenerator : private DefaultMacroAssembler { if ((term->quantityType == QuantifierFixedCount) && (term->type != PatternTerm::TypeParentheticalAssertion)) op.m_checkAdjust -= disjunction->m_minimumSize; if (op.m_checkAdjust) - op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust)); + op.m_jumps.append(jumpIfNoAvailableInput(op.m_checkAdjust.unsafeGet())); YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked -= lastOp.m_checkAdjust; - m_checked += op.m_checkAdjust; + m_checkedOffset -= lastOp.m_checkAdjust; + m_checkedOffset += op.m_checkAdjust; break; } case OpSimpleNestedAlternativeEnd: @@ -1568,10 +2042,7 @@ class YarrGenerator : private 
DefaultMacroAssembler { // In the non-simple case, store a 'return address' so we can backtrack correctly. if (op.m_op == OpNestedAlternativeEnd) { unsigned parenthesesFrameLocation = term->frameLocation; - unsigned alternativeFrameLocation = parenthesesFrameLocation; - if (term->quantityType != QuantifierFixedCount) - alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; - op.m_returnAddress = storeToFrameWithPatch(alternativeFrameLocation); + op.m_returnAddress = storeToFrameWithPatch(parenthesesFrameLocation + BackTrackInfoParentheses::returnAddressIndex()); } if (term->quantityType != QuantifierFixedCount && !m_ops[op.m_previousOp].m_alternative->m_minimumSize) { @@ -1587,7 +2058,7 @@ class YarrGenerator : private DefaultMacroAssembler { op.m_jumps.clear(); YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked -= lastOp.m_checkAdjust; + m_checkedOffset -= lastOp.m_checkAdjust; break; } @@ -1599,7 +2070,7 @@ class YarrGenerator : private DefaultMacroAssembler { PatternTerm* term = op.m_term; unsigned parenthesesFrameLocation = term->frameLocation; const RegisterID indexTemporary = regT0; - ASSERT(term->quantityCount == 1); + ASSERT(term->quantityMaxCount == 1); // Upon entry to a Greedy quantified set of parenthese store the index. // We'll use this for two purposes: @@ -1616,12 +2087,12 @@ class YarrGenerator : private DefaultMacroAssembler { // // FIXME: for capturing parens, could use the index in the capture array? 
if (term->quantityType == QuantifierGreedy) - storeToFrame(index, parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoParenthesesOnce::beginIndex()); else if (term->quantityType == QuantifierNonGreedy) { - storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation + BackTrackInfoParenthesesOnce::beginIndex()); op.m_jumps.append(jump()); op.m_reentry = label(); - storeToFrame(index, parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoParenthesesOnce::beginIndex()); } // If the parenthese are capturing, store the starting index value to the @@ -1631,12 +2102,12 @@ class YarrGenerator : private DefaultMacroAssembler { // offsets only afterwards, at the point the results array is // being accessed. if (term->capture() && compileMode == IncludeSubpatterns) { - int inputOffset = term->inputPosition - m_checked; + unsigned inputOffset = (m_checkedOffset - term->inputPosition).unsafeGet(); if (term->quantityType == QuantifierFixedCount) - inputOffset -= term->parentheses.disjunction->m_minimumSize; + inputOffset += term->parentheses.disjunction->m_minimumSize; if (inputOffset) { move(index, indexTemporary); - add32(Imm32(inputOffset), indexTemporary); + sub32(Imm32(inputOffset), indexTemporary); setSubpatternStart(indexTemporary, term->parentheses.subpatternId); } else setSubpatternStart(index, term->parentheses.subpatternId); @@ -1646,18 +2117,16 @@ class YarrGenerator : private DefaultMacroAssembler { case OpParenthesesSubpatternOnceEnd: { PatternTerm* term = op.m_term; const RegisterID indexTemporary = regT0; - ASSERT(term->quantityCount == 1); + ASSERT(term->quantityMaxCount == 1); -#ifndef NDEBUG // Runtime ASSERT to make sure that the nested alternative handled the // "no input consumed" check. 
- if (term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) { + if (!ASSERT_DISABLED && term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) { Jump pastBreakpoint; pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); - breakpoint(); + // ### abortWithReason(YARRNoInputConsumed); pastBreakpoint.link(this); } -#endif // If the parenthese are capturing, store the ending index value to the // captures array, offsetting as necessary. @@ -1666,10 +2135,10 @@ class YarrGenerator : private DefaultMacroAssembler { // offsets only afterwards, at the point the results array is // being accessed. if (term->capture() && compileMode == IncludeSubpatterns) { - int inputOffset = term->inputPosition - m_checked; + unsigned inputOffset = (m_checkedOffset - term->inputPosition).unsafeGet(); if (inputOffset) { move(index, indexTemporary); - add32(Imm32(inputOffset), indexTemporary); + sub32(Imm32(inputOffset), indexTemporary); setSubpatternEnd(indexTemporary, term->parentheses.subpatternId); } else setSubpatternEnd(index, term->parentheses.subpatternId); @@ -1691,7 +2160,7 @@ class YarrGenerator : private DefaultMacroAssembler { case OpParenthesesSubpatternTerminalBegin: { PatternTerm* term = op.m_term; ASSERT(term->quantityType == QuantifierGreedy); - ASSERT(term->quantityCount == quantifyInfinite); + ASSERT(term->quantityMaxCount == quantifyInfinite); ASSERT(!term->capture()); // Upon entry set a label to loop back to. @@ -1699,23 +2168,23 @@ class YarrGenerator : private DefaultMacroAssembler { // Store the start index of the current match; we need to reject zero // length matches. 
- storeToFrame(index, term->frameLocation); + storeToFrame(index, term->frameLocation + BackTrackInfoParenthesesTerminal::beginIndex()); break; } case OpParenthesesSubpatternTerminalEnd: { YarrOp& beginOp = m_ops[op.m_previousOp]; -#ifndef NDEBUG - PatternTerm* term = op.m_term; - - // Runtime ASSERT to make sure that the nested alternative handled the - // "no input consumed" check. - Jump pastBreakpoint; - pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); - breakpoint(); - pastBreakpoint.link(this); -#endif + if (!ASSERT_DISABLED) { + PatternTerm* term = op.m_term; + + // Runtime ASSERT to make sure that the nested alternative handled the + // "no input consumed" check. + Jump pastBreakpoint; + pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, term->frameLocation * sizeof(void*))); + // ### abortWithReason(YARRNoInputConsumed); + pastBreakpoint.link(this); + } - // We know that the match is non-zero, we can accept it and + // We know that the match is non-zero, we can accept it and // loop back up to the head of the subpattern. jump(beginOp.m_reentry); @@ -1725,6 +2194,131 @@ class YarrGenerator : private DefaultMacroAssembler { break; } + // OpParenthesesSubpatternBegin/End + // + // These nodes support generic subpatterns. + case OpParenthesesSubpatternBegin: { +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + PatternTerm* term = op.m_term; + unsigned parenthesesFrameLocation = term->frameLocation; + + // Upon entry to a Greedy quantified set of parenthese store the index. + // We'll use this for two purposes: + // - To indicate which iteration we are on of mathing the remainder of + // the expression after the parentheses - the first, including the + // match within the parentheses, or the second having skipped over them. + // - To check for empty matches, which must be rejected. 
+ // + // At the head of a NonGreedy set of parentheses we'll immediately set the + // value on the stack to -1 (indicating a match skipping the subpattern), + // and plant a jump to the end. We'll also plant a label to backtrack to + // to reenter the subpattern later, with a store to set up index on the + // second iteration. + // + // FIXME: for capturing parens, could use the index in the capture array? + if (term->quantityType == QuantifierGreedy || term->quantityType == QuantifierNonGreedy) { + storeToFrame(TrustedImm32(0), parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex()); + storeToFrame(TrustedImmPtr(nullptr), parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex()); + + if (term->quantityType == QuantifierNonGreedy) { + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()); + op.m_jumps.append(jump()); + } + + op.m_reentry = label(); + RegisterID currParenContextReg = regT0; + RegisterID newParenContextReg = regT1; + + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex(), currParenContextReg); + allocateParenContext(newParenContextReg); + storePtr(currParenContextReg, newParenContextReg); + storeToFrame(newParenContextReg, parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex()); + saveParenContext(newParenContextReg, regT2, term->parentheses.subpatternId, term->parentheses.lastSubpatternId, parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()); + } + + // If the parenthese are capturing, store the starting index value to the + // captures array, offsetting as necessary. + // + // FIXME: could avoid offsetting this value in JIT code, apply + // offsets only afterwards, at the point the results array is + // being accessed. 
+ if (term->capture() && compileMode == IncludeSubpatterns) { + const RegisterID indexTemporary = regT0; + unsigned inputOffset = (m_checkedOffset - term->inputPosition).unsafeGet(); + if (term->quantityType == QuantifierFixedCount) + inputOffset += term->parentheses.disjunction->m_minimumSize; + if (inputOffset) { + move(index, indexTemporary); + sub32(Imm32(inputOffset), indexTemporary); + setSubpatternStart(indexTemporary, term->parentheses.subpatternId); + } else + setSubpatternStart(index, term->parentheses.subpatternId); + } +#else // !YARR_JIT_ALL_PARENS_EXPRESSIONS + RELEASE_ASSERT_NOT_REACHED(); +#endif + break; + } + case OpParenthesesSubpatternEnd: { +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + PatternTerm* term = op.m_term; + unsigned parenthesesFrameLocation = term->frameLocation; + + // Runtime ASSERT to make sure that the nested alternative handled the + // "no input consumed" check. + if (!ASSERT_DISABLED && term->quantityType != QuantifierFixedCount && !term->parentheses.disjunction->m_minimumSize) { + Jump pastBreakpoint; + pastBreakpoint = branch32(NotEqual, index, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*))); + // ### abortWithReason(YARRNoInputConsumed); + pastBreakpoint.link(this); + } + + const RegisterID countTemporary = regT1; + + YarrOp& beginOp = m_ops[op.m_previousOp]; + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex(), countTemporary); + add32(TrustedImm32(1), countTemporary); + storeToFrame(countTemporary, parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex()); + + // If the parenthese are capturing, store the ending index value to the + // captures array, offsetting as necessary. + // + // FIXME: could avoid offsetting this value in JIT code, apply + // offsets only afterwards, at the point the results array is + // being accessed. 
+ if (term->capture() && compileMode == IncludeSubpatterns) { + const RegisterID indexTemporary = regT0; + + unsigned inputOffset = (m_checkedOffset - term->inputPosition).unsafeGet(); + if (inputOffset) { + move(index, indexTemporary); + sub32(Imm32(inputOffset), indexTemporary); + setSubpatternEnd(indexTemporary, term->parentheses.subpatternId); + } else + setSubpatternEnd(index, term->parentheses.subpatternId); + } + + // If the parentheses are quantified Greedy then add a label to jump back + // to if get a failed match from after the parentheses. For NonGreedy + // parentheses, link the jump from before the subpattern to here. + if (term->quantityType == QuantifierGreedy) { + if (term->quantityMaxCount != quantifyInfinite) + branch32(Below, countTemporary, Imm32(term->quantityMaxCount.unsafeGet())).linkTo(beginOp.m_reentry, this); + else + jump(beginOp.m_reentry); + + op.m_reentry = label(); + } else if (term->quantityType == QuantifierNonGreedy) { + YarrOp& beginOp = m_ops[op.m_previousOp]; + beginOp.m_jumps.link(this); + } +#else // !YARR_JIT_ALL_PARENS_EXPRESSIONS + RELEASE_ASSERT_NOT_REACHED(); +#endif + break; + } + // OpParentheticalAssertionBegin/End case OpParentheticalAssertionBegin: { PatternTerm* term = op.m_term; @@ -1732,14 +2326,14 @@ class YarrGenerator : private DefaultMacroAssembler { // Store the current index - assertions should not update index, so // we will need to restore it upon a successful match. 
unsigned parenthesesFrameLocation = term->frameLocation; - storeToFrame(index, parenthesesFrameLocation); + storeToFrame(index, parenthesesFrameLocation + BackTrackInfoParentheticalAssertion::beginIndex()); // Check - op.m_checkAdjust = m_checked - term->inputPosition; + op.m_checkAdjust = m_checkedOffset - term->inputPosition; if (op.m_checkAdjust) - sub32(Imm32(op.m_checkAdjust), index); + sub32(Imm32(op.m_checkAdjust.unsafeGet()), index); - m_checked -= op.m_checkAdjust; + m_checkedOffset -= op.m_checkAdjust; break; } case OpParentheticalAssertionEnd: { @@ -1747,7 +2341,7 @@ class YarrGenerator : private DefaultMacroAssembler { // Restore the input index value. unsigned parenthesesFrameLocation = term->frameLocation; - loadFromFrame(parenthesesFrameLocation, index); + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheticalAssertion::beginIndex(), index); // If inverted, a successful match of the assertion must be treated // as a failure, so jump to backtracking. @@ -1757,15 +2351,13 @@ class YarrGenerator : private DefaultMacroAssembler { } YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked += lastOp.m_checkAdjust; + m_checkedOffset += lastOp.m_checkAdjust; break; } case OpMatchFailed: removeCallFrame(); - move(TrustedImmPtr((void*)WTF::notFound), returnRegister); - move(TrustedImm32(0), returnRegister2); - generateReturn(); + generateFailReturn(); break; } @@ -1817,9 +2409,9 @@ class YarrGenerator : private DefaultMacroAssembler { if (op.m_op == OpBodyAlternativeNext) { PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; - m_checked += priorAlternative->m_minimumSize; + m_checkedOffset += priorAlternative->m_minimumSize; } - m_checked -= alternative->m_minimumSize; + m_checkedOffset -= alternative->m_minimumSize; // Is this the last alternative? If not, then if we backtrack to this point we just // need to jump to try to match the next alternative. 
@@ -1836,6 +2428,8 @@ class YarrGenerator : private DefaultMacroAssembler { } bool onceThrough = endOp.m_nextOp == notFound; + + JumpList lastStickyAlternativeFailures; // First, generate code to handle cases where we backtrack out of an attempted match // of the last alternative. If this is a 'once through' set of alternatives then we @@ -1851,43 +2445,49 @@ class YarrGenerator : private DefaultMacroAssembler { && (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) && (alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize == 1)) m_backtrackingState.linkTo(beginOp->m_reentry, this); - else { + else if (m_pattern.sticky() && m_ops[op.m_nextOp].m_op == OpBodyAlternativeEnd) { + // It is a sticky pattern and the last alternative failed, jump to the end. + m_backtrackingState.takeBacktracksToJumpList(lastStickyAlternativeFailures, this); + } else { // We need to generate a trampoline of code to execute before looping back // around to the first alternative. m_backtrackingState.link(this); - // If the pattern size is not fixed, then store the start index, for use if we match. - if (!m_pattern.m_body->m_hasFixedSize) { - if (alternative->m_minimumSize == 1) - setMatchStart(index); - else { - move(index, regT0); - if (alternative->m_minimumSize) - sub32(Imm32(alternative->m_minimumSize - 1), regT0); - else - add32(TrustedImm32(1), regT0); - setMatchStart(regT0); + // No need to advance and retry for a sticky pattern. + if (!m_pattern.sticky()) { + // If the pattern size is not fixed, then store the start index for use if we match. + if (!m_pattern.m_body->m_hasFixedSize) { + if (alternative->m_minimumSize == 1) + setMatchStart(index); + else { + move(index, regT0); + if (alternative->m_minimumSize) + sub32(Imm32(alternative->m_minimumSize - 1), regT0); + else + add32(TrustedImm32(1), regT0); + setMatchStart(regT0); + } } - } - // Generate code to loop. Check whether the last alternative is longer than the - // first (e.g. /a|xy/ or /a|xyz/). 
- if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { - // We want to loop, and increment input position. If the delta is 1, it is - // already correctly incremented, if more than one then decrement as appropriate. - unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; - ASSERT(delta); - if (delta != 1) - sub32(Imm32(delta - 1), index); - jump(beginOp->m_reentry); - } else { - // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot - // be sufficent input available to handle this, so just fall through. - unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; - if (delta != 0xFFFFFFFFu) { - // We need to check input because we are incrementing the input. - add32(Imm32(delta + 1), index); - checkInput().linkTo(beginOp->m_reentry, this); + // Generate code to loop. Check whether the last alternative is longer than the + // first (e.g. /a|xy/ or /a|xyz/). + if (alternative->m_minimumSize > beginOp->m_alternative->m_minimumSize) { + // We want to loop, and increment input position. If the delta is 1, it is + // already correctly incremented, if more than one then decrement as appropriate. + unsigned delta = alternative->m_minimumSize - beginOp->m_alternative->m_minimumSize; + ASSERT(delta); + if (delta != 1) + sub32(Imm32(delta - 1), index); + jump(beginOp->m_reentry); + } else { + // If the first alternative has minimum size 0xFFFFFFFFu, then there cannot + // be sufficent input available to handle this, so just fall through. + unsigned delta = beginOp->m_alternative->m_minimumSize - alternative->m_minimumSize; + if (delta != 0xFFFFFFFFu) { + // We need to check input because we are incrementing the input. 
+ add32(Imm32(delta + 1), index); + checkInput().linkTo(beginOp->m_reentry, this); + } } } } @@ -1896,7 +2496,7 @@ class YarrGenerator : private DefaultMacroAssembler { // We can reach this point in the code in two ways: // - Fallthrough from the code above (a repeating alternative backtracked out of its // last alternative, and did not have sufficent input to run the first). - // - We will loop back up to the following label when a releating alternative loops, + // - We will loop back up to the following label when a repeating alternative loops, // following a failed input check. // // Either way, we have just failed the input check for the first alternative. @@ -1956,56 +2556,57 @@ class YarrGenerator : private DefaultMacroAssembler { needsToUpdateMatchStart = false; } - // Check whether there is sufficient input to loop. Increment the input position by - // one, and check. Also add in the minimum disjunction size before checking - there - // is no point in looping if we're just going to fail all the input checks around - // the next iteration. - ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); - if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { - // If the last alternative had the same minimum size as the disjunction, - // just simply increment input pos by 1, no adjustment based on minimum size. - add32(TrustedImm32(1), index); - } else { - // If the minumum for the last alternative was one greater than than that - // for the disjunction, we're already progressed by 1, nothing to do! - unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; - if (delta) - sub32(Imm32(delta), index); - } - Jump matchFailed = jumpIfNoAvailableInput(); + if (!m_pattern.sticky()) { + // Check whether there is sufficient input to loop. Increment the input position by + // one, and check. 
Also add in the minimum disjunction size before checking - there + // is no point in looping if we're just going to fail all the input checks around + // the next iteration. + ASSERT(alternative->m_minimumSize >= m_pattern.m_body->m_minimumSize); + if (alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) { + // If the last alternative had the same minimum size as the disjunction, + // just simply increment input pos by 1, no adjustment based on minimum size. + add32(TrustedImm32(1), index); + } else { + // If the minumum for the last alternative was one greater than than that + // for the disjunction, we're already progressed by 1, nothing to do! + unsigned delta = (alternative->m_minimumSize - m_pattern.m_body->m_minimumSize) - 1; + if (delta) + sub32(Imm32(delta), index); + } + Jump matchFailed = jumpIfNoAvailableInput(); + + if (needsToUpdateMatchStart) { + if (!m_pattern.m_body->m_minimumSize) + setMatchStart(index); + else { + move(index, regT0); + sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); + setMatchStart(regT0); + } + } - if (needsToUpdateMatchStart) { - if (!m_pattern.m_body->m_minimumSize) - setMatchStart(index); + // Calculate how much more input the first alternative requires than the minimum + // for the body as a whole. If no more is needed then we dont need an additional + // input check here - jump straight back up to the start of the first alternative. 
+ if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) + jump(beginOp->m_reentry); else { - move(index, regT0); - sub32(Imm32(m_pattern.m_body->m_minimumSize), regT0); - setMatchStart(regT0); + if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) + add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); + else + sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); + checkInput().linkTo(beginOp->m_reentry, this); + jump(firstInputCheckFailed); } - } - // Calculate how much more input the first alternative requires than the minimum - // for the body as a whole. If no more is needed then we dont need an additional - // input check here - jump straight back up to the start of the first alternative. - if (beginOp->m_alternative->m_minimumSize == m_pattern.m_body->m_minimumSize) - jump(beginOp->m_reentry); - else { - if (beginOp->m_alternative->m_minimumSize > m_pattern.m_body->m_minimumSize) - add32(Imm32(beginOp->m_alternative->m_minimumSize - m_pattern.m_body->m_minimumSize), index); - else - sub32(Imm32(m_pattern.m_body->m_minimumSize - beginOp->m_alternative->m_minimumSize), index); - checkInput().linkTo(beginOp->m_reentry, this); - jump(firstInputCheckFailed); + // We jump to here if we iterate to the point that there is insufficient input to + // run any matches, and need to return a failure state from JIT code. + matchFailed.link(this); } - // We jump to here if we iterate to the point that there is insufficient input to - // run any matches, and need to return a failure state from JIT code. 
- matchFailed.link(this); - + lastStickyAlternativeFailures.link(this); removeCallFrame(); - move(TrustedImmPtr((void*)WTF::notFound), returnRegister); - move(TrustedImm32(0), returnRegister2); - generateReturn(); + generateFailReturn(); break; } case OpBodyAlternativeEnd: { @@ -2013,7 +2614,7 @@ class YarrGenerator : private DefaultMacroAssembler { ASSERT(m_backtrackingState.isEmpty()); PatternAlternative* priorAlternative = m_ops[op.m_previousOp].m_alternative; - m_checked += priorAlternative->m_minimumSize; + m_checkedOffset += priorAlternative->m_minimumSize; break; } @@ -2064,7 +2665,7 @@ class YarrGenerator : private DefaultMacroAssembler { if (op.m_checkAdjust) { // Handle the cases where we need to link the backtracks here. m_backtrackingState.link(this); - sub32(Imm32(op.m_checkAdjust), index); + sub32(Imm32(op.m_checkAdjust.unsafeGet()), index); if (!isLastAlternative) { // An alternative that is not the last should jump to its successor. jump(nextOp.m_reentry); @@ -2114,9 +2715,9 @@ class YarrGenerator : private DefaultMacroAssembler { if (!isBegin) { YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked += lastOp.m_checkAdjust; + m_checkedOffset += lastOp.m_checkAdjust; } - m_checked -= op.m_checkAdjust; + m_checkedOffset -= op.m_checkAdjust; break; } case OpSimpleNestedAlternativeEnd: @@ -2136,10 +2737,7 @@ class YarrGenerator : private DefaultMacroAssembler { // Plant a jump to the return address. unsigned parenthesesFrameLocation = term->frameLocation; - unsigned alternativeFrameLocation = parenthesesFrameLocation; - if (term->quantityType != QuantifierFixedCount) - alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; - loadFromFrameAndJump(alternativeFrameLocation); + loadFromFrameAndJump(parenthesesFrameLocation + BackTrackInfoParentheses::returnAddressIndex()); // Link the DataLabelPtr associated with the end of the last // alternative to this point. 
@@ -2147,7 +2745,7 @@ class YarrGenerator : private DefaultMacroAssembler { } YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked += lastOp.m_checkAdjust; + m_checkedOffset += lastOp.m_checkAdjust; break; } @@ -2168,9 +2766,9 @@ class YarrGenerator : private DefaultMacroAssembler { // matching start, depending of whether the match is Greedy or NonGreedy. case OpParenthesesSubpatternOnceBegin: { PatternTerm* term = op.m_term; - ASSERT(term->quantityCount == 1); + ASSERT(term->quantityMaxCount == 1); - // We only need to backtrack to thispoint if capturing or greedy. + // We only need to backtrack to this point if capturing or greedy. if ((term->capture() && compileMode == IncludeSubpatterns) || term->quantityType == QuantifierGreedy) { m_backtrackingState.link(this); @@ -2182,7 +2780,7 @@ class YarrGenerator : private DefaultMacroAssembler { if (term->quantityType == QuantifierGreedy) { // Clear the flag in the stackframe indicating we ran through the subpattern. unsigned parenthesesFrameLocation = term->frameLocation; - storeToFrame(TrustedImm32(-1), parenthesesFrameLocation); + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation + BackTrackInfoParenthesesOnce::beginIndex()); // Jump to after the parentheses, skipping the subpattern. jump(m_ops[op.m_nextOp].m_reentry); // A backtrack from after the parentheses, when skipping the subpattern, @@ -2204,7 +2802,7 @@ class YarrGenerator : private DefaultMacroAssembler { // are currently in a state where we had skipped over the subpattern // (in which case the flag value on the stack will be -1). 
unsigned parenthesesFrameLocation = term->frameLocation; - Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, parenthesesFrameLocation * sizeof(void*)), TrustedImm32(-1)); + Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, (parenthesesFrameLocation + BackTrackInfoParenthesesOnce::beginIndex()) * sizeof(void*)), TrustedImm32(-1)); if (term->quantityType == QuantifierGreedy) { // For Greedy parentheses, we skip after having already tried going @@ -2248,6 +2846,108 @@ class YarrGenerator : private DefaultMacroAssembler { m_backtrackingState.append(op.m_jumps); break; + // OpParenthesesSubpatternBegin/End + // + // When we are backtracking back out of a capturing subpattern we need + // to clear the start index in the matches output array, to record that + // this subpattern has not been captured. + // + // When backtracking back out of a Greedy quantified subpattern we need + // to catch this, and try running the remainder of the alternative after + // the subpattern again, skipping the parentheses. + // + // Upon backtracking back into a quantified set of parentheses we need to + // check whether we were currently skipping the subpattern. If not, we + // can backtrack into them, if we were we need to either backtrack back + // out of the start of the parentheses, or jump back to the forwards + // matching start, depending of whether the match is Greedy or NonGreedy. 
+ case OpParenthesesSubpatternBegin: { +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + PatternTerm* term = op.m_term; + unsigned parenthesesFrameLocation = term->frameLocation; + + if (term->quantityType != QuantifierFixedCount) { + m_backtrackingState.link(this); + + if (term->quantityType == QuantifierGreedy) { + RegisterID currParenContextReg = regT0; + RegisterID newParenContextReg = regT1; + + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex(), currParenContextReg); + + restoreParenContext(currParenContextReg, regT2, term->parentheses.subpatternId, term->parentheses.lastSubpatternId, parenthesesFrameLocation); + + freeParenContext(currParenContextReg, newParenContextReg); + storeToFrame(newParenContextReg, parenthesesFrameLocation + BackTrackInfoParentheses::parenContextHeadIndex()); + const RegisterID countTemporary = regT0; + loadFromFrame(parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex(), countTemporary); + Jump zeroLengthMatch = branchTest32(Zero, countTemporary); + + sub32(TrustedImm32(1), countTemporary); + storeToFrame(countTemporary, parenthesesFrameLocation + BackTrackInfoParentheses::matchAmountIndex()); + + jump(m_ops[op.m_nextOp].m_reentry); + + zeroLengthMatch.link(this); + + // Clear the flag in the stackframe indicating we didn't run through the subpattern. + storeToFrame(TrustedImm32(-1), parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()); + + jump(m_ops[op.m_nextOp].m_reentry); + } + + // If Greedy, jump to the end. + if (term->quantityType == QuantifierGreedy) { + // A backtrack from after the parentheses, when skipping the subpattern, + // will jump back to here. 
+ op.m_jumps.link(this); + } + + m_backtrackingState.fallthrough(); + } +#else // !YARR_JIT_ALL_PARENS_EXPRESSIONS + RELEASE_ASSERT_NOT_REACHED(); +#endif + break; + } + case OpParenthesesSubpatternEnd: { +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + PatternTerm* term = op.m_term; + + if (term->quantityType != QuantifierFixedCount) { + m_backtrackingState.link(this); + + // Check whether we should backtrack back into the parentheses, or if we + // are currently in a state where we had skipped over the subpattern + // (in which case the flag value on the stack will be -1). + unsigned parenthesesFrameLocation = term->frameLocation; + Jump hadSkipped = branch32(Equal, Address(stackPointerRegister, (parenthesesFrameLocation + BackTrackInfoParentheses::beginIndex()) * sizeof(void*)), TrustedImm32(-1)); + + if (term->quantityType == QuantifierGreedy) { + // For Greedy parentheses, we skip after having already tried going + // through the subpattern, so if we get here we're done. + YarrOp& beginOp = m_ops[op.m_previousOp]; + beginOp.m_jumps.append(hadSkipped); + } else { + // For NonGreedy parentheses, we try skipping the subpattern first, + // so if we get here we need to try running through the subpattern + // next. Jump back to the start of the parentheses in the forwards + // matching path. 
+ ASSERT(term->quantityType == QuantifierNonGreedy); + YarrOp& beginOp = m_ops[op.m_previousOp]; + hadSkipped.linkTo(beginOp.m_reentry, this); + } + + m_backtrackingState.fallthrough(); + } + + m_backtrackingState.append(op.m_jumps); +#else // !YARR_JIT_ALL_PARENS_EXPRESSIONS + RELEASE_ASSERT_NOT_REACHED(); +#endif + break; + } + // OpParentheticalAssertionBegin/End case OpParentheticalAssertionBegin: { PatternTerm* term = op.m_term; @@ -2260,7 +2960,7 @@ class YarrGenerator : private DefaultMacroAssembler { m_backtrackingState.link(this); if (op.m_checkAdjust) - add32(Imm32(op.m_checkAdjust), index); + add32(Imm32(op.m_checkAdjust.unsafeGet()), index); // In an inverted assertion failure to match the subpattern // is treated as a successful match - jump to the end of the @@ -2277,7 +2977,7 @@ class YarrGenerator : private DefaultMacroAssembler { // added the failure caused by a successful match to this. m_backtrackingState.append(endOp.m_jumps); - m_checked += op.m_checkAdjust; + m_checkedOffset += op.m_checkAdjust; break; } case OpParentheticalAssertionEnd: { @@ -2289,7 +2989,7 @@ class YarrGenerator : private DefaultMacroAssembler { m_backtrackingState.takeBacktracksToJumpList(op.m_jumps, this); YarrOp& lastOp = m_ops[op.m_previousOp]; - m_checked -= lastOp.m_checkAdjust; + m_checkedOffset -= lastOp.m_checkAdjust; break; } @@ -2307,9 +3007,9 @@ class YarrGenerator : private DefaultMacroAssembler { // Emits ops for a subpattern (set of parentheses). These consist // of a set of alternatives wrapped in an outer set of nodes for // the parentheses. - // Supported types of parentheses are 'Once' (quantityCount == 1) - // and 'Terminal' (non-capturing parentheses quantified as greedy - // and infinite). + // Supported types of parentheses are 'Once' (quantityMaxCount == 1), + // 'Terminal' (non-capturing parentheses quantified as greedy + // and infinite), and 0 based greedy quantified parentheses. 
// Alternatives will use the 'Simple' set of ops if either the // subpattern is terminal (in which case we will never need to // backtrack), or if the subpattern only contains one alternative. @@ -2328,7 +3028,10 @@ class YarrGenerator : private DefaultMacroAssembler { // comes where the subpattern is capturing, in which case we would // need to restore the capture from the first subpattern upon a // failure in the second. - if (term->quantityCount == 1 && !term->parentheses.isCopy) { + if (term->quantityMinCount && term->quantityMinCount != term->quantityMaxCount) { + m_failureReason = JITFailureReason::VariableCountedParenthesisWithNonZeroMinimum; + return; + } if (term->quantityMaxCount == 1 && !term->parentheses.isCopy) { // Select the 'Once' nodes. parenthesesBeginOpCode = OpParenthesesSubpatternOnceBegin; parenthesesEndOpCode = OpParenthesesSubpatternOnceEnd; @@ -2344,9 +3047,31 @@ class YarrGenerator : private DefaultMacroAssembler { parenthesesBeginOpCode = OpParenthesesSubpatternTerminalBegin; parenthesesEndOpCode = OpParenthesesSubpatternTerminalEnd; } else { +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + // We only handle generic parenthesis with greedy counts. + if (term->quantityType != QuantifierGreedy) { + // This subpattern is not supported by the JIT. + m_failureReason = JITFailureReason::NonGreedyParenthesizedSubpattern; + return; + } + + m_containsNestedSubpatterns = true; + + // Select the 'Generic' nodes. + parenthesesBeginOpCode = OpParenthesesSubpatternBegin; + parenthesesEndOpCode = OpParenthesesSubpatternEnd; + + // If there is more than one alternative we cannot use the 'simple' nodes. + if (term->parentheses.disjunction->m_alternatives.size() != 1) { + alternativeBeginOpCode = OpNestedAlternativeBegin; + alternativeNextOpCode = OpNestedAlternativeNext; + alternativeEndOpCode = OpNestedAlternativeEnd; + } +#else // This subpattern is not supported by the JIT. 
- m_shouldFallBack = true; + m_failureReason = JITFailureReason::ParenthesizedSubpattern; return; +#endif } size_t parenBegin = m_ops.size(); @@ -2355,7 +3080,7 @@ class YarrGenerator : private DefaultMacroAssembler { m_ops.append(alternativeBeginOpCode); m_ops.last().m_previousOp = notFound; m_ops.last().m_term = term; - Vector<OwnPtr<PatternAlternative> >& alternatives = term->parentheses.disjunction->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = term->parentheses.disjunction->m_alternatives; for (unsigned i = 0; i < alternatives.size(); ++i) { size_t lastOpIndex = m_ops.size() - 1; @@ -2406,7 +3131,7 @@ class YarrGenerator : private DefaultMacroAssembler { m_ops.append(OpSimpleNestedAlternativeBegin); m_ops.last().m_previousOp = notFound; m_ops.last().m_term = term; - Vector<OwnPtr<PatternAlternative> >& alternatives = term->parentheses.disjunction->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = term->parentheses.disjunction->m_alternatives; for (unsigned i = 0; i < alternatives.size(); ++i) { size_t lastOpIndex = m_ops.size() - 1; @@ -2480,7 +3205,7 @@ class YarrGenerator : private DefaultMacroAssembler { // to return the failing result. void opCompileBody(PatternDisjunction* disjunction) { - Vector<OwnPtr<PatternAlternative> >& alternatives = disjunction->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = disjunction->m_alternatives; size_t currentAlternativeIndex = 0; // Emit the 'once through' alternatives. 
@@ -2548,18 +3273,59 @@ class YarrGenerator : private DefaultMacroAssembler { lastOp.m_nextOp = repeatLoop; } + void generateTryReadUnicodeCharacterHelper() + { +#ifdef JIT_UNICODE_EXPRESSIONS + if (m_tryReadUnicodeCharacterCalls.isEmpty()) + return; + + ASSERT(m_decodeSurrogatePairs); + + m_tryReadUnicodeCharacterEntry = label(); + + tryReadUnicodeCharImpl(regT0); + + ret(); +#endif + } + void generateEnter() { #if CPU(X86_64) push(X86Registers::ebp); move(stackPointerRegister, X86Registers::ebp); - push(X86Registers::ebx); + + if (m_pattern.m_saveInitialStartValue) + push(X86Registers::ebx); + +#if OS(WINDOWS) + push(X86Registers::edi); +#endif +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + if (m_containsNestedSubpatterns) { +#if OS(WINDOWS) + push(X86Registers::esi); +#endif + push(X86Registers::r12); + } +#endif + + if (m_decodeSurrogatePairs) { + push(X86Registers::r13); + push(X86Registers::r14); + push(X86Registers::r15); + + move(TrustedImm32(0xd800), leadingSurrogateTag); + move(TrustedImm32(0xdc00), trailingSurrogateTag); + } // The ABI doesn't guarantee the upper bits are zero on unsigned arguments, so clear them ourselves. zeroExtend32ToPtr(index, index); zeroExtend32ToPtr(length, length); #if OS(WINDOWS) if (compileMode == IncludeSubpatterns) loadPtr(Address(X86Registers::ebp, 6 * sizeof(void*)), output); + // rcx is the pointer to the allocated space for result in x64 Windows. 
+ push(X86Registers::ecx); #endif #elif CPU(X86) push(X86Registers::ebp); @@ -2580,6 +3346,14 @@ class YarrGenerator : private DefaultMacroAssembler { loadPtr(Address(X86Registers::ebp, 2 * sizeof(void*)), output); #endif #elif CPU(ARM64) + if (m_decodeSurrogatePairs) { + pushPair(framePointerRegister, linkRegister); + move(TrustedImm32(0x10000), supplementaryPlanesBase); + move(TrustedImm32(0xfffffc00), surrogateTagMask); + move(TrustedImm32(0xd800), leadingSurrogateTag); + move(TrustedImm32(0xdc00), trailingSurrogateTag); + } + // The ABI doesn't guarantee the upper bits are zero on unsigned arguments, so clear them ourselves. zeroExtend32ToPtr(index, index); zeroExtend32ToPtr(length, length); @@ -2587,45 +3361,60 @@ class YarrGenerator : private DefaultMacroAssembler { push(ARMRegisters::r4); push(ARMRegisters::r5); push(ARMRegisters::r6); -#if CPU(ARM_TRADITIONAL) - push(ARMRegisters::r8); // scratch register -#endif - if (compileMode == IncludeSubpatterns) - move(ARMRegisters::r3, output); -#elif CPU(SH4) - push(SH4Registers::r11); - push(SH4Registers::r13); + push(ARMRegisters::r8); #elif CPU(MIPS) // Do nothing. #endif + + store8(TrustedImm32(1), &m_vm->isExecutingInRegExpJIT); } void generateReturn() { + store8(TrustedImm32(0), &m_vm->isExecutingInRegExpJIT); + #if CPU(X86_64) #if OS(WINDOWS) // Store the return value in the allocated space pointed by rcx. 
+ pop(X86Registers::ecx); store64(returnRegister, Address(X86Registers::ecx)); store64(returnRegister2, Address(X86Registers::ecx, sizeof(void*))); move(X86Registers::ecx, returnRegister); #endif - pop(X86Registers::ebx); + if (m_decodeSurrogatePairs) { + pop(X86Registers::r15); + pop(X86Registers::r14); + pop(X86Registers::r13); + } + +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + if (m_containsNestedSubpatterns) { + pop(X86Registers::r12); +#if OS(WINDOWS) + pop(X86Registers::esi); +#endif + } +#endif +#if OS(WINDOWS) + pop(X86Registers::edi); +#endif + + if (m_pattern.m_saveInitialStartValue) + pop(X86Registers::ebx); pop(X86Registers::ebp); #elif CPU(X86) pop(X86Registers::esi); pop(X86Registers::edi); pop(X86Registers::ebx); pop(X86Registers::ebp); +#elif CPU(ARM64) + if (m_decodeSurrogatePairs) + popPair(framePointerRegister, linkRegister); #elif CPU(ARM) -#if CPU(ARM_TRADITIONAL) - pop(ARMRegisters::r8); // scratch register -#endif + pop(ARMRegisters::r8); pop(ARMRegisters::r6); pop(ARMRegisters::r5); pop(ARMRegisters::r4); -#elif CPU(SH4) - pop(SH4Registers::r13); - pop(SH4Registers::r11); #elif CPU(MIPS) // Do nothing #endif @@ -2633,25 +3422,57 @@ class YarrGenerator : private DefaultMacroAssembler { } public: - YarrGenerator(YarrPattern& pattern, YarrCharSize charSize) - : m_pattern(pattern) + YarrGenerator(VM* vm, YarrPattern& pattern, YarrCodeBlock& codeBlock, YarrCharSize charSize) + : m_vm(vm) + , m_pattern(pattern) + , m_codeBlock(codeBlock) , m_charSize(charSize) - , m_charScale(m_charSize == Char8 ? TimesOne: TimesTwo) - , m_shouldFallBack(false) - , m_checked(0) + , m_decodeSurrogatePairs(m_charSize == Char16 && m_pattern.unicode()) + , m_unicodeIgnoreCase(m_pattern.unicode() && m_pattern.ignoreCase()) + , m_canonicalMode(m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2) +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + , m_containsNestedSubpatterns(false) + , m_parenContextSizes(compileMode == IncludeSubpatterns ? 
m_pattern.m_numSubpatterns : 0, m_pattern.m_body->m_callFrameSize) +#endif { } - void compile(JSGlobalData* globalData, YarrCodeBlock& jitObject) + void compile() { + YarrCodeBlock& codeBlock = m_codeBlock; + +#ifndef JIT_UNICODE_EXPRESSIONS + if (m_decodeSurrogatePairs) { + codeBlock.setFallBackWithFailureReason(JITFailureReason::DecodeSurrogatePair); + return; + } +#endif + +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + if (m_containsNestedSubpatterns) + codeBlock.setUsesPaternContextBuffer(); +#endif + + // We need to compile before generating code since we set flags based on compilation that + // are used during generation. + opCompileBody(m_pattern.m_body); + + if (m_failureReason) { + codeBlock.setFallBackWithFailureReason(*m_failureReason); + return; + } + generateEnter(); Jump hasInput = checkInput(); - move(TrustedImmPtr((void*)WTF::notFound), returnRegister); - move(TrustedImm32(0), returnRegister2); - generateReturn(); + generateFailReturn(); hasInput.link(this); +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + if (m_containsNestedSubpatterns) + move(TrustedImm32(matchLimit), remainingMatchCount); +#endif + if (compileMode == IncludeSubpatterns) { for (unsigned i = 0; i < m_pattern.m_numSubpatterns + 1; ++i) store32(TrustedImm32(-1), Address(output, (i << 1) * sizeof(int))); @@ -2662,47 +3483,80 @@ public: initCallFrame(); - // Compile the pattern to the internal 'YarrOp' representation. - opCompileBody(m_pattern.m_body); - - // If we encountered anything we can't handle in the JIT code - // (e.g. backreferences) then return early. - if (m_shouldFallBack) { - jitObject.setFallBack(true); - return; +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + if (m_containsNestedSubpatterns) + initParenContextFreeList(); +#endif + + if (m_pattern.m_saveInitialStartValue) { +#ifdef HAVE_INITIAL_START_REG + move(index, initialStart); +#else + storeToFrame(index, m_pattern.m_initialStartValueFrameLocation); +#endif } generate(); backtrack(); - // Link & finalize the code. 
- LinkBuffer<JSC::DefaultMacroAssembler> linkBuffer(*globalData, this, REGEXP_CODE_ID); + generateTryReadUnicodeCharacterHelper(); + + generateJITFailReturn(); + + JSGlobalData data(m_vm->regExpAllocator); + DefaultLinkBuffer linkBuffer(data, this, REGEXP_CODE_ID, JITCompilationCanFail); + if (linkBuffer.didFailToAllocate()) { + codeBlock.setFallBackWithFailureReason(JITFailureReason::ExecutableMemoryAllocationFailure); + return; + } + + if (!m_tryReadUnicodeCharacterCalls.isEmpty()) { + CodeLocationLabel tryReadUnicodeCharacterHelper = linkBuffer.locationOf(m_tryReadUnicodeCharacterEntry); + + for (auto call : m_tryReadUnicodeCharacterCalls) + linkBuffer.link(call, tryReadUnicodeCharacterHelper); + } + m_backtrackingState.linkDataLabels(linkBuffer); if (compileMode == MatchOnly) { if (m_charSize == Char8) - jitObject.set8BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 8-bit regular expression"))); + codeBlock.set8BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, "Match-only 8-bit regular expression")); else - jitObject.set16BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, ("Match-only 16-bit regular expression"))); + codeBlock.set16BitCodeMatchOnly(FINALIZE_CODE(linkBuffer, "Match-only 16-bit regular expression")); } else { if (m_charSize == Char8) - jitObject.set8BitCode(FINALIZE_CODE(linkBuffer, ("8-bit regular expression"))); + codeBlock.set8BitCode(FINALIZE_CODE(linkBuffer, "8-bit regular expression")); else - jitObject.set16BitCode(FINALIZE_CODE(linkBuffer, ("16-bit regular expression"))); + codeBlock.set16BitCode(FINALIZE_CODE(linkBuffer, "16-bit regular expression")); } - jitObject.setFallBack(m_shouldFallBack); + if (m_failureReason) + codeBlock.setFallBackWithFailureReason(*m_failureReason); } private: + VM* m_vm; + YarrPattern& m_pattern; + YarrCodeBlock& m_codeBlock; YarrCharSize m_charSize; - Scale m_charScale; - // Used to detect regular expression constructs that are not currently // supported in the JIT; fall back to the interpreter when this is 
detected. - bool m_shouldFallBack; + std::optional<JITFailureReason> m_failureReason; + + bool m_decodeSurrogatePairs; + bool m_unicodeIgnoreCase; + CanonicalMode m_canonicalMode; +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + bool m_containsNestedSubpatterns; + ParenContextSizes m_parenContextSizes; +#endif + JumpList m_abortExecution; + JumpList m_hitMatchLimit; + Vector<Call> m_tryReadUnicodeCharacterCalls; + Label m_tryReadUnicodeCharacterEntry; // The regular expression expressed as a linear sequence of operations. Vector<YarrOp, 128> m_ops; @@ -2717,18 +3571,47 @@ private: // FIXME: This should go away. Rather than tracking this value throughout // code generation, we should gather this information up front & store it // on the YarrOp structure. - int m_checked; + Checked<unsigned> m_checkedOffset; // This class records state whilst generating the backtracking path of code. BacktrackingState m_backtrackingState; }; -void jitCompile(YarrPattern& pattern, YarrCharSize charSize, JSGlobalData* globalData, YarrCodeBlock& jitObject, YarrJITCompileMode mode) +static void dumpCompileFailure(JITFailureReason failure) +{ + switch (failure) { + case JITFailureReason::DecodeSurrogatePair: + dataLog("Can't JIT a pattern decoding surrogate pairs\n"); + break; + case JITFailureReason::BackReference: + dataLog("Can't JIT a pattern containing back references\n"); + break; + case JITFailureReason::VariableCountedParenthesisWithNonZeroMinimum: + dataLog("Can't JIT a pattern containing a variable counted parenthesis with a non-zero minimum\n"); + break; + case JITFailureReason::ParenthesizedSubpattern: + dataLog("Can't JIT a pattern containing parenthesized subpatterns\n"); + break; + case JITFailureReason::NonGreedyParenthesizedSubpattern: + dataLog("Can't JIT a pattern containing non-greedy parenthesized subpatterns\n"); + break; + case JITFailureReason::ExecutableMemoryAllocationFailure: + dataLog("Can't JIT because of failure of allocation of executable memory\n"); + break; 
+ } +} + +void jitCompile(YarrPattern& pattern, YarrCharSize charSize, VM* vm, YarrCodeBlock& codeBlock, YarrJITCompileMode mode) { if (mode == MatchOnly) - YarrGenerator<MatchOnly>(pattern, charSize).compile(globalData, jitObject); + YarrGenerator<MatchOnly>(vm, pattern, codeBlock, charSize).compile(); else - YarrGenerator<IncludeSubpatterns>(pattern, charSize).compile(globalData, jitObject); + YarrGenerator<IncludeSubpatterns>(vm, pattern, codeBlock, charSize).compile(); + + if (auto failureReason = codeBlock.failureReason()) { + if (Options::dumpCompiledRegExpPatterns()) + dumpCompileFailure(*failureReason); + } } }} diff --git a/src/3rdparty/masm/yarr/YarrJIT.h b/src/3rdparty/masm/yarr/YarrJIT.h index bb7033fdea..8b6b3a7577 100644 --- a/src/3rdparty/masm/yarr/YarrJIT.h +++ b/src/3rdparty/masm/yarr/YarrJIT.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009-2018 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,12 +23,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef YarrJIT_h -#define YarrJIT_h +#pragma once + +#include <Platform.h> #if ENABLE(YARR_JIT) -#include "JSGlobalData.h" #include "MacroAssemblerCodeRef.h" #include "MatchResult.h" #include "Yarr.h" @@ -40,19 +40,39 @@ #define YARR_CALL #endif +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) +constexpr size_t patternContextBufferSize = 8192; // Space caller allocates to save nested parenthesis context +#endif + namespace JSC { -class JSGlobalData; +class VM; class ExecutablePool; namespace Yarr { +enum class JITFailureReason : uint8_t { + DecodeSurrogatePair, + BackReference, + VariableCountedParenthesisWithNonZeroMinimum, + ParenthesizedSubpattern, + NonGreedyParenthesizedSubpattern, + ExecutableMemoryAllocationFailure, +}; + class YarrCodeBlock { -#if CPU(X86_64) +#if CPU(X86_64) || CPU(ARM64) +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output, void* freeParenContext, unsigned parenContextSize) YARR_CALL; + typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output, void* freeParenContext, unsigned parenContextSize) YARR_CALL; + typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length, void*, void* freeParenContext, unsigned parenContextSize) YARR_CALL; + typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length, void*, void* freeParenContext, unsigned parenContextSize) YARR_CALL; +#else typedef MatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; typedef MatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; typedef MatchResult (*YarrJITCodeMatchOnly8)(const LChar* input, unsigned start, unsigned length) YARR_CALL; typedef MatchResult (*YarrJITCodeMatchOnly16)(const UChar* input, unsigned start, unsigned length) YARR_CALL; +#endif #else 
typedef EncodedMatchResult (*YarrJITCode8)(const LChar* input, unsigned start, unsigned length, int* output) YARR_CALL; typedef EncodedMatchResult (*YarrJITCode16)(const UChar* input, unsigned start, unsigned length, int* output) YARR_CALL; @@ -61,17 +81,10 @@ class YarrCodeBlock { #endif public: - YarrCodeBlock() - : m_needFallBack(false) - { - } - - ~YarrCodeBlock() - { - } + YarrCodeBlock() = default; - void setFallBack(bool fallback) { m_needFallBack = fallback; } - bool isFallBack() { return m_needFallBack; } + void setFallBackWithFailureReason(JITFailureReason failureReason) { m_failureReason = failureReason; } + std::optional<JITFailureReason> failureReason() { return m_failureReason; } bool has8BitCode() { return m_ref8.size(); } bool has16BitCode() { return m_ref16.size(); } @@ -83,6 +96,34 @@ public: void set8BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly8 = matchOnly; } void set16BitCodeMatchOnly(MacroAssemblerCodeRef matchOnly) { m_matchOnly16 = matchOnly; } +#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + bool usesPatternContextBuffer() { return m_usesPatternContextBuffer; } + void setUsesPaternContextBuffer() { m_usesPatternContextBuffer = true; } + + MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output, void* freeParenContext, unsigned parenContextSize) + { + ASSERT(has8BitCode()); + return MatchResult(reinterpret_cast<YarrJITCode8>(m_ref8.code().executableAddress())(input, start, length, output, freeParenContext, parenContextSize)); + } + + MatchResult execute(const UChar* input, unsigned start, unsigned length, int* output, void* freeParenContext, unsigned parenContextSize) + { + ASSERT(has16BitCode()); + return MatchResult(reinterpret_cast<YarrJITCode16>(m_ref16.code().executableAddress())(input, start, length, output, freeParenContext, parenContextSize)); + } + + MatchResult execute(const LChar* input, unsigned start, unsigned length, void* freeParenContext, unsigned parenContextSize) + { + 
ASSERT(has8BitCodeMatchOnly()); + return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly8>(m_matchOnly8.code().executableAddress())(input, start, length, 0, freeParenContext, parenContextSize)); + } + + MatchResult execute(const UChar* input, unsigned start, unsigned length, void* freeParenContext, unsigned parenContextSize) + { + ASSERT(has16BitCodeMatchOnly()); + return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length, 0, freeParenContext, parenContextSize)); + } +#else MatchResult execute(const LChar* input, unsigned start, unsigned length, int* output) { ASSERT(has8BitCode()); @@ -106,18 +147,54 @@ public: ASSERT(has16BitCodeMatchOnly()); return MatchResult(reinterpret_cast<YarrJITCodeMatchOnly16>(m_matchOnly16.code().executableAddress())(input, start, length)); } +#endif #if ENABLE(REGEXP_TRACING) - void *getAddr() { return m_ref.code().executableAddress(); } + void *get8BitMatchOnlyAddr() + { + if (!has8BitCodeMatchOnly()) + return 0; + + return m_matchOnly8.code().executableAddress(); + } + + void *get16BitMatchOnlyAddr() + { + if (!has16BitCodeMatchOnly()) + return 0; + + return m_matchOnly16.code().executableAddress(); + } + + void *get8BitMatchAddr() + { + if (!has8BitCode()) + return 0; + + return m_ref8.code().executableAddress(); + } + + void *get16BitMatchAddr() + { + if (!has16BitCode()) + return 0; + + return m_ref16.code().executableAddress(); + } #endif + size_t size() const + { + return m_ref8.size() + m_ref16.size() + m_matchOnly8.size() + m_matchOnly16.size(); + } + void clear() { m_ref8 = MacroAssemblerCodeRef(); m_ref16 = MacroAssemblerCodeRef(); m_matchOnly8 = MacroAssemblerCodeRef(); m_matchOnly16 = MacroAssemblerCodeRef(); - m_needFallBack = false; + m_failureReason = std::nullopt; } private: @@ -125,17 +202,18 @@ private: MacroAssemblerCodeRef m_ref16; MacroAssemblerCodeRef m_matchOnly8; MacroAssemblerCodeRef m_matchOnly16; - bool m_needFallBack; +#if 
ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS) + bool m_usesPatternContextBuffer; +#endif + std::optional<JITFailureReason> m_failureReason; }; enum YarrJITCompileMode { MatchOnly, IncludeSubpatterns }; -void jitCompile(YarrPattern&, YarrCharSize, JSGlobalData*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns); +void jitCompile(YarrPattern&, YarrCharSize, VM*, YarrCodeBlock& jitObject, YarrJITCompileMode = IncludeSubpatterns); } } // namespace JSC::Yarr #endif - -#endif // YarrJIT_h diff --git a/src/3rdparty/masm/yarr/YarrParser.h b/src/3rdparty/masm/yarr/YarrParser.h index 13ffd3a1d6..3e5311f1fb 100644 --- a/src/3rdparty/masm/yarr/YarrParser.h +++ b/src/3rdparty/masm/yarr/YarrParser.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,46 +23,25 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef YarrParser_h -#define YarrParser_h +#pragma once #include "Yarr.h" +#include "YarrPattern.h" +#include "YarrUnicodeProperties.h" #include <wtf/ASCIICType.h> +#include <wtf/HashSet.h> +#include <wtf/Optional.h> +#include <wtf/text/StringBuilder.h> #include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { -#define REGEXP_ERROR_PREFIX "Invalid regular expression: " - -enum BuiltInCharacterClassID { - DigitClassID, - SpaceClassID, - WordClassID, - NewlineClassID, -}; - // The Parser class should not be used directly - only via the Yarr::parse() method. 
template<class Delegate, typename CharType> class Parser { private: template<class FriendDelegate> - friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); - - enum ErrorCode { - NoError, - PatternTooLarge, - QuantifierOutOfOrder, - QuantifierWithoutAtom, - QuantifierTooLarge, - MissingParentheses, - ParenthesesUnmatched, - ParenthesesTypeInvalid, - CharacterClassUnmatched, - CharacterClassOutOfOrder, - EscapeUnterminated, - NumberOfErrorCodes - }; + friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); /* * CharacterClassParserDelegate: @@ -77,7 +56,7 @@ private: public: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) : m_delegate(delegate) - , m_err(err) + , m_errorCode(err) , m_state(Empty) , m_character(0) { @@ -102,7 +81,7 @@ private: * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ * is different to /[a\-z]/). */ - void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) + void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) { switch (m_state) { case AfterCharacterClass: @@ -118,7 +97,8 @@ private: m_state = AfterCharacterClassHyphen; return; } - Q_FALLTHROUGH(); // cached character, so treat this as Empty. + // Otherwise just fall through - cached character so treat this as Empty. + FALLTHROUGH; case Empty: m_character = ch; @@ -136,7 +116,7 @@ private: case CachedCharacterHyphen: if (ch < m_character) { - m_err = CharacterClassOutOfOrder; + m_errorCode = ErrorCode::CharacterClassOutOfOrder; return; } m_delegate.atomCharacterClassRange(m_character, ch); @@ -168,8 +148,7 @@ private: case CachedCharacter: // Flush the currently cached character, then fall through. 
m_delegate.atomCharacterClassAtom(m_character); - Q_FALLTHROUGH(); - + FALLTHROUGH; case Empty: case AfterCharacterClass: m_state = AfterCharacterClass; @@ -187,7 +166,7 @@ private: case CachedCharacterHyphen: m_delegate.atomCharacterClassAtom(m_character); m_delegate.atomCharacterClassAtom('-'); - // fall through + FALLTHROUGH; case AfterCharacterClassHyphen: m_delegate.atomCharacterClassBuiltIn(classID, invert); m_state = Empty; @@ -215,10 +194,11 @@ private: // invoked with inCharacterClass set. NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } + NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(String) { RELEASE_ASSERT_NOT_REACHED(); } private: Delegate& m_delegate; - ErrorCode& m_err; + ErrorCode& m_errorCode; enum CharacterClassConstructionState { Empty, CachedCharacter, @@ -226,20 +206,31 @@ private: AfterCharacterClass, AfterCharacterClassHyphen, } m_state; - UChar m_character; + UChar32 m_character; }; - Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) + Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) : m_delegate(delegate) , m_backReferenceLimit(backReferenceLimit) - , m_err(NoError) - , m_data(pattern.getCharacters<CharType>()) + , m_data(pattern.characters<CharType>()) , m_size(pattern.length()) - , m_index(0) - , m_parenthesesNestingDepth(0) + , m_isUnicode(isUnicode) { } + // The handling of IdentityEscapes is different depending on the unicode flag. + // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. + // For non-unicode patterns, most any character can be escaped. 
+ bool isIdentityEscapeAnError(int ch) + { + if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) { + m_errorCode = ErrorCode::InvalidIdentityEscape; + return true; + } + + return false; + } + /* * parseEscape(): * @@ -263,12 +254,12 @@ private: template<bool inCharacterClass, class EscapeDelegate> bool parseEscape(EscapeDelegate& delegate) { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '\\'); consume(); if (atEndOfPattern()) { - m_err = EscapeUnterminated; + m_errorCode = ErrorCode::EscapeUnterminated; return false; } @@ -276,18 +267,24 @@ private: // Assertions case 'b': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('b')) + break; + delegate.atomPatternCharacter('\b'); - else { + } else { delegate.assertionWordBoundary(false); return false; } break; case 'B': consume(); - if (inCharacterClass) + if (inCharacterClass) { + if (isIdentityEscapeAnError('B')) + break; + delegate.atomPatternCharacter('B'); - else { + } else { delegate.assertionWordBoundary(true); return false; } @@ -296,27 +293,27 @@ private: // CharacterClassEscape case 'd': consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, false); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); break; case 's': consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, false); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); break; case 'w': consume(); - delegate.atomBuiltInCharacterClass(WordClassID, false); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); break; case 'D': consume(); - delegate.atomBuiltInCharacterClass(DigitClassID, true); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); break; case 'S': consume(); - delegate.atomBuiltInCharacterClass(SpaceClassID, true); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); break; case 'W': consume(); - 
delegate.atomBuiltInCharacterClass(WordClassID, true); + delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); break; // DecimalEscape @@ -341,15 +338,22 @@ private: } restoreState(state); + + if (m_isUnicode) { + m_errorCode = ErrorCode::InvalidBackreference; + return false; + } } - - // Not a backreference, and not octal. + + // Not a backreference, and not octal. Just a number. if (peek() >= '8') { - delegate.atomPatternCharacter('\\'); + delegate.atomPatternCharacter(consume()); break; } + + // Fall-through to handle this as an octal escape. + FALLTHROUGH; } - Q_FALLTHROUGH(); // Handle this as an octal escape. // Octal escape case '0': @@ -400,32 +404,161 @@ private: case 'x': { consume(); int x = tryConsumeHex(2); - if (x == -1) + if (x == -1) { + if (isIdentityEscapeAnError('x')) + break; + delegate.atomPatternCharacter('x'); - else + } else delegate.atomPatternCharacter(x); break; } + // Named backreference + case 'k': { + consume(); + ParseState state = saveState(); + if (!atEndOfPattern() && !inCharacterClass) { + if (consume() == '<') { + auto groupName = tryConsumeGroupName(); + if (groupName && m_captureGroupNames.contains(groupName.value())) { + delegate.atomNamedBackReference(groupName.value()); + break; + } + if (m_isUnicode) { + m_errorCode = ErrorCode::InvalidBackreference; + break; + } + } + } + restoreState(state); + delegate.atomPatternCharacter('k'); + break; + } + + // Unicode property escapes + case 'p': + case 'P': { + int escapeChar = consume(); + + if (!m_isUnicode) { + if (isIdentityEscapeAnError(escapeChar)) + break; + delegate.atomPatternCharacter(escapeChar); + break; + } + + if (!atEndOfPattern() && peek() == '{') { + consume(); + auto optClassID = tryConsumeUnicodePropertyExpression(); + if (!optClassID) { + // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression + break; + } + delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); + } else + 
m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + break; + } + // UnicodeEscape case 'u': { consume(); + if (atEndOfPattern()) { + if (isIdentityEscapeAnError('u')) + break; + + delegate.atomPatternCharacter('u'); + break; + } + + if (m_isUnicode && peek() == '{') { + consume(); + UChar32 codePoint = 0; + do { + if (atEndOfPattern() || !isASCIIHexDigit(peek())) { + m_errorCode = ErrorCode::InvalidUnicodeEscape; + break; + } + + codePoint = (codePoint << 4) | toASCIIHexValue(consume()); + + if (codePoint > UCHAR_MAX_VALUE) + m_errorCode = ErrorCode::InvalidUnicodeEscape; + } while (!atEndOfPattern() && peek() != '}'); + if (!atEndOfPattern() && peek() == '}') + consume(); + else if (!hasError(m_errorCode)) + m_errorCode = ErrorCode::InvalidUnicodeEscape; + if (hasError(m_errorCode)) + return false; + + delegate.atomPatternCharacter(codePoint); + break; + } int u = tryConsumeHex(4); - if (u == -1) + if (u == -1) { + if (isIdentityEscapeAnError('u')) + break; + delegate.atomPatternCharacter('u'); - else + } else { + // If we have the first of a surrogate pair, look for the second. + if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { + ParseState state = saveState(); + consume(); + + if (tryConsume('u')) { + int surrogate2 = tryConsumeHex(4); + if (U16_IS_TRAIL(surrogate2)) { + u = U16_GET_SUPPLEMENTARY(u, surrogate2); + delegate.atomPatternCharacter(u); + break; + } + } + + restoreState(state); + } delegate.atomPatternCharacter(u); + } break; } // IdentityEscape default: + int ch = peek(); + + if (ch == '-' && m_isUnicode && inCharacterClass) { + // \- is allowed for ClassEscape with unicode flag. 
+ delegate.atomPatternCharacter(consume()); + break; + } + + if (isIdentityEscapeAnError(ch)) + break; + delegate.atomPatternCharacter(consume()); } return true; } + UChar32 consumePossibleSurrogatePair() + { + UChar32 ch = consume(); + if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { + ParseState state = saveState(); + + UChar32 surrogate2 = consume(); + if (U16_IS_TRAIL(surrogate2)) + ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); + else + restoreState(state); + } + + return ch; + } + /* * parseAtomEscape(), parseCharacterClassEscape(): * @@ -449,11 +582,11 @@ private: */ void parseCharacterClass() { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '['); consume(); - CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); + CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode); characterClassConstructor.begin(tryConsume('^')); @@ -469,14 +602,14 @@ private: break; default: - characterClassConstructor.atomPatternCharacter(consume(), true); + characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); } - if (m_err) + if (hasError(m_errorCode)) return; } - m_err = CharacterClassUnmatched; + m_errorCode = ErrorCode::CharacterClassUnmatched; } /* @@ -486,13 +619,13 @@ private: */ void parseParenthesesBegin() { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == '('); consume(); if (tryConsume('?')) { if (atEndOfPattern()) { - m_err = ParenthesesTypeInvalid; + m_errorCode = ErrorCode::ParenthesesTypeInvalid; return; } @@ -508,9 +641,23 @@ private: case '!': m_delegate.atomParentheticalAssertionBegin(true); break; - + + case '<': { + auto groupName = tryConsumeGroupName(); + if (groupName) { + auto setAddResult = m_captureGroupNames.add(groupName.value()); + if (setAddResult.isNewEntry) + m_delegate.atomParenthesesSubpatternBegin(true, groupName); + else + m_errorCode = ErrorCode::DuplicateGroupName; + } else + m_errorCode = 
ErrorCode::InvalidGroupName; + + break; + } + default: - m_err = ParenthesesTypeInvalid; + m_errorCode = ErrorCode::ParenthesesTypeInvalid; } } else m_delegate.atomParenthesesSubpatternBegin(); @@ -525,14 +672,14 @@ private: */ void parseParenthesesEnd() { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(peek() == ')'); consume(); if (m_parenthesesNestingDepth > 0) m_delegate.atomParenthesesEnd(); else - m_err = ParenthesesUnmatched; + m_errorCode = ErrorCode::ParenthesesUnmatched; --m_parenthesesNestingDepth; } @@ -544,18 +691,18 @@ private: */ void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) { - ASSERT(!m_err); + ASSERT(!hasError(m_errorCode)); ASSERT(min <= max); if (min == UINT_MAX) { - m_err = QuantifierTooLarge; + m_errorCode = ErrorCode::QuantifierTooLarge; return; } if (lastTokenWasAnAtom) m_delegate.quantifyAtom(min, max, !tryConsume('?')); else - m_err = QuantifierWithoutAtom; + m_errorCode = ErrorCode::QuantifierWithoutAtom; } /* @@ -603,7 +750,7 @@ private: case '.': consume(); - m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); + m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); lastTokenWasAnAtom = true; break; @@ -649,7 +796,7 @@ private: if (min <= max) parseQuantifier(lastTokenWasAnAtom, min, max); else - m_err = QuantifierOutOfOrder; + m_errorCode = ErrorCode::QuantifierOutOfOrder; lastTokenWasAnAtom = false; break; } @@ -657,51 +804,36 @@ private: restoreState(state); } - Q_FALLTHROUGH(); // if we did not find a complete quantifer, fall through to the default case. + // if we did not find a complete quantifer, fall through to the default case. 
+ FALLTHROUGH; default: - m_delegate.atomPatternCharacter(consume()); + m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); lastTokenWasAnAtom = true; } - if (m_err) + if (hasError(m_errorCode)) return; } if (m_parenthesesNestingDepth > 0) - m_err = MissingParentheses; + m_errorCode = ErrorCode::MissingParentheses; } /* * parse(): * - * This method calls parseTokens() to parse over the input and converts any - * error code to a const char* for a result. + * This method calls parseTokens() to parse over the input and returns error code for a result. */ - const char* parse() + ErrorCode parse() { if (m_size > MAX_PATTERN_SIZE) - m_err = PatternTooLarge; + m_errorCode = ErrorCode::PatternTooLarge; else parseTokens(); - ASSERT(atEndOfPattern() || m_err); - - // The order of this array must match the ErrorCode enum. - static const char* errorMessages[NumberOfErrorCodes] = { - 0, // NoError - REGEXP_ERROR_PREFIX "regular expression too large", - REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", - REGEXP_ERROR_PREFIX "nothing to repeat", - REGEXP_ERROR_PREFIX "number too large in {} quantifier", - REGEXP_ERROR_PREFIX "missing )", - REGEXP_ERROR_PREFIX "unmatched parentheses", - REGEXP_ERROR_PREFIX "unrecognized character after (?", - REGEXP_ERROR_PREFIX "missing terminating ] for character class", - REGEXP_ERROR_PREFIX "range out of order in character class", - REGEXP_ERROR_PREFIX "\\ at end of pattern" - }; - - return errorMessages[m_err]; + ASSERT(atEndOfPattern() || hasError(m_errorCode)); + + return m_errorCode; } // Misc helper functions: @@ -724,6 +856,12 @@ private: return m_index == m_size; } + unsigned patternRemaining() + { + ASSERT(m_index <= m_size); + return m_size - m_index; + } + int peek() { ASSERT(m_index < m_size); @@ -741,6 +879,87 @@ private: return peek() - '0'; } + int tryConsumeUnicodeEscape() + { + if (!tryConsume('u')) + return -1; + + if (m_isUnicode && tryConsume('{')) { + int codePoint = 0; + do { + if (atEndOfPattern() 
|| !isASCIIHexDigit(peek())) { + m_errorCode = ErrorCode::InvalidUnicodeEscape; + return -1; + } + + codePoint = (codePoint << 4) | toASCIIHexValue(consume()); + + if (codePoint > UCHAR_MAX_VALUE) { + m_errorCode = ErrorCode::InvalidUnicodeEscape; + return -1; + } + } while (!atEndOfPattern() && peek() != '}'); + if (!atEndOfPattern() && peek() == '}') + consume(); + else if (!hasError(m_errorCode)) + m_errorCode = ErrorCode::InvalidUnicodeEscape; + if (hasError(m_errorCode)) + return -1; + + return codePoint; + } + + int u = tryConsumeHex(4); + if (u == -1) + return -1; + + // If we have the first of a surrogate pair, look for the second. + if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { + ParseState state = saveState(); + consume(); + + if (tryConsume('u')) { + int surrogate2 = tryConsumeHex(4); + if (U16_IS_TRAIL(surrogate2)) { + u = U16_GET_SUPPLEMENTARY(u, surrogate2); + return u; + } + } + + restoreState(state); + } + + return u; + } + + int tryConsumeIdentifierCharacter() + { + int ch = peek(); + + if (ch == '\\') { + consume(); + ch = tryConsumeUnicodeEscape(); + } else + consume(); + + return ch; + } + + bool isIdentifierStart(int ch) + { + return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); + } + + bool isIdentifierPart(int ch) + { + return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; + } + + bool isUnicodePropertyValueExpressionChar(int ch) + { + return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '='; + } + int consume() { ASSERT(m_index < m_size); @@ -755,13 +974,10 @@ private: unsigned consumeNumber() { - unsigned n = consumeDigit(); - // check for overflow. 
- for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { - n = newValue; - consume(); - } - return n; + Checked<unsigned, RecordOverflow> n = consumeDigit(); + while (peekIsDigit()) + n = n * 10 + consumeDigit(); + return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); } unsigned consumeOctal() @@ -797,13 +1013,99 @@ private: return n; } + std::optional<String> tryConsumeGroupName() + { + if (atEndOfPattern()) + return std::nullopt; + + ParseState state = saveState(); + + int ch = tryConsumeIdentifierCharacter(); + + if (isIdentifierStart(ch)) { + StringBuilder identifierBuilder; + identifierBuilder.append(ch); + + while (!atEndOfPattern()) { + ch = tryConsumeIdentifierCharacter(); + if (ch == '>') + return std::optional<String>(identifierBuilder.toString()); + + if (!isIdentifierPart(ch)) + break; + + identifierBuilder.append(ch); + } + } + + restoreState(state); + + return std::nullopt; + } + + std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression() + { + if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) { + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return std::nullopt; + } + + StringBuilder expressionBuilder; + String unicodePropertyName; + bool foundEquals = false; + unsigned errors = 0; + + expressionBuilder.append(consume()); + + while (!atEndOfPattern()) { + int ch = peek(); + if (ch == '}') { + consume(); + if (errors) { + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return std::nullopt; + } + + if (foundEquals) { + auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString()); + if (!result) + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return result; + } + + auto result = unicodeMatchProperty(expressionBuilder.toString()); + if (!result) + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return result; + } + + consume(); + if (ch == '=') { + if (!foundEquals) { + foundEquals = true; + 
unicodePropertyName = expressionBuilder.toString(); + expressionBuilder.clear(); + } else + errors++; + } else if (!isUnicodePropertyValueExpressionChar(ch)) + errors++; + else + expressionBuilder.append(ch); + } + + m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; + return std::nullopt; + } + Delegate& m_delegate; unsigned m_backReferenceLimit; - ErrorCode m_err; + ErrorCode m_errorCode { ErrorCode::NoError }; const CharType* m_data; unsigned m_size; - unsigned m_index; - unsigned m_parenthesesNestingDepth; + unsigned m_index { 0 }; + bool m_isUnicode; + unsigned m_parenthesesNestingDepth { 0 }; + HashSet<String> m_captureGroupNames; // Derived by empirical testing of compile time in PCRE and WREC. static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; @@ -823,17 +1125,18 @@ private: * void assertionEOL(); * void assertionWordBoundary(bool invert); * - * void atomPatternCharacter(UChar ch); + * void atomPatternCharacter(UChar32 ch); * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); * void atomCharacterClassBegin(bool invert) - * void atomCharacterClassAtom(UChar ch) - * void atomCharacterClassRange(UChar begin, UChar end) + * void atomCharacterClassAtom(UChar32 ch) + * void atomCharacterClassRange(UChar32 begin, UChar32 end) * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) * void atomCharacterClassEnd() - * void atomParenthesesSubpatternBegin(bool capture = true); + * void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> groupName); * void atomParentheticalAssertionBegin(bool invert = false); * void atomParenthesesEnd(); * void atomBackReference(unsigned subpatternId); + * void atomNamedBackReference(String subpatternName); * * void quantifyAtom(unsigned min, unsigned max, bool greedy); * @@ -869,13 +1172,11 @@ private: */ template<class Delegate> -const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) +ErrorCode 
parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) { if (pattern.is8Bit()) - return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); - return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); + return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); + return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); } } } // namespace JSC::Yarr - -#endif // YarrParser_h diff --git a/src/3rdparty/masm/yarr/YarrPattern.cpp b/src/3rdparty/masm/yarr/YarrPattern.cpp index c7e5b6b09b..ac66ea1b9a 100644 --- a/src/3rdparty/masm/yarr/YarrPattern.cpp +++ b/src/3rdparty/masm/yarr/YarrPattern.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -27,10 +27,15 @@ #include "config.h" #include "YarrPattern.h" +#include "Options.h" #include "Yarr.h" -#include "YarrCanonicalizeUCS2.h" +#include "YarrCanonicalize.h" #include "YarrParser.h" +#include <wtf/DataLog.h> +#include <wtf/Optional.h> +//#include <wtf/Threading.h> #include <wtf/Vector.h> +#include <wtf/text/WTFString.h> using namespace WTF; @@ -40,8 +45,11 @@ namespace JSC { namespace Yarr { class CharacterClassConstructor { public: - CharacterClassConstructor(bool isCaseInsensitive = false) + CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode) : m_isCaseInsensitive(isCaseInsensitive) + , m_hasNonBMPCharacters(false) + , m_anyCharacter(false) + , m_canonicalMode(canonicalMode) { } @@ -51,6 +59,8 @@ public: m_ranges.clear(); m_matchesUnicode.clear(); m_rangesUnicode.clear(); + m_hasNonBMPCharacters = false; + m_anyCharacter = false; } void append(const CharacterClass* other) @@ -65,11 +75,71 
@@ public: addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); } - void putChar(UChar ch) + void appendInverted(const CharacterClass* other) { - // Handle ascii cases. - if (ch <= 0x7f) { - if (m_isCaseInsensitive && isASCIIAlpha(ch)) { + auto addSortedInverted = [&](UChar32 min, UChar32 max, + const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges, + Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) { + + auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) { + if (lo < hiPlusOne) { + if (lo + 1 == hiPlusOne) + addSorted(destMatches, lo); + else + addSortedRange(destRanges, lo, hiPlusOne - 1); + } + }; + + UChar32 lo = min; + size_t matchesIndex = 0; + size_t rangesIndex = 0; + bool matchesRemaining = matchesIndex < srcMatches.size(); + bool rangesRemaining = rangesIndex < srcRanges.size(); + + if (!matchesRemaining && !rangesRemaining) { + addSortedMatchOrRange(min, max + 1); + return; + } + + while (matchesRemaining || rangesRemaining) { + UChar32 hiPlusOne; + UChar32 nextLo; + + if (matchesRemaining + && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) { + hiPlusOne = srcMatches[matchesIndex]; + nextLo = hiPlusOne + 1; + ++matchesIndex; + matchesRemaining = matchesIndex < srcMatches.size(); + } else { + hiPlusOne = srcRanges[rangesIndex].begin; + nextLo = srcRanges[rangesIndex].end + 1; + ++rangesIndex; + rangesRemaining = rangesIndex < srcRanges.size(); + } + + addSortedMatchOrRange(lo, hiPlusOne); + + lo = nextLo; + } + + addSortedMatchOrRange(lo, max + 1); + }; + + addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges); + addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode); + } + + void putChar(UChar32 ch) + { + if (!m_isCaseInsensitive) { + addSorted(ch); + return; + } + + if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) { + // Handle 
ASCII cases. + if (isASCIIAlpha(ch)) { addSorted(m_matches, toASCIIUpper(ch)); addSorted(m_matches, toASCIILower(ch)); } else @@ -77,40 +147,33 @@ public: return; } - // Simple case, not a case-insensitive match. - if (!m_isCaseInsensitive) { - addSorted(m_matchesUnicode, ch); - return; - } - // Add multiple matches, if necessary. - UCS2CanonicalizationRange* info = rangeInfoFor(ch); + const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_canonicalMode); if (info->type == CanonicalizeUnique) - addSorted(m_matchesUnicode, ch); + addSorted(ch); else putUnicodeIgnoreCase(ch, info); } - void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) + void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info) { ASSERT(m_isCaseInsensitive); - ASSERT(ch > 0x7f); ASSERT(ch >= info->begin && ch <= info->end); ASSERT(info->type != CanonicalizeUnique); if (info->type == CanonicalizeSet) { - for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) - addSorted(m_matchesUnicode, ch); + for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set) + addSorted(ch); } else { - addSorted(m_matchesUnicode, ch); - addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + addSorted(ch); + addSorted(getCanonicalPair(info, ch)); } } - void putRange(UChar lo, UChar hi) + void putRange(UChar32 lo, UChar32 hi) { - if (lo <= 0x7f) { + if (isASCII(lo)) { char asciiLo = lo; - char asciiHi = std::min(hi, (UChar)0x7f); + char asciiHi = std::min(hi, (UChar32)0x7f); addSortedRange(m_ranges, lo, asciiHi); if (m_isCaseInsensitive) { @@ -120,19 +183,19 @@ public: addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); } } - if (hi <= 0x7f) + if (isASCII(hi)) return; - lo = std::max(lo, (UChar)0x80); + lo = std::max(lo, (UChar32)0x80); addSortedRange(m_rangesUnicode, lo, hi); if (!m_isCaseInsensitive) return; - UCS2CanonicalizationRange* info = rangeInfoFor(lo); + const 
CanonicalizationRange* info = canonicalRangeInfoFor(lo, m_canonicalMode); while (true) { // Handle the range [lo .. end] - UChar end = std::min<UChar>(info->end, hi); + UChar32 end = std::min<UChar32>(info->end, hi); switch (info->type) { case CanonicalizeUnique: @@ -140,7 +203,7 @@ public: break; case CanonicalizeSet: { UChar ch; - for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + for (const UChar32* set = canonicalCharacterSetInfo(info->value, m_canonicalMode); (ch = *set); ++set) addSorted(m_matchesUnicode, ch); break; } @@ -175,24 +238,38 @@ public: } - PassOwnPtr<CharacterClass> charClass() + std::unique_ptr<CharacterClass> charClass() { - OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass); + coalesceTables(); + + auto characterClass = std::make_unique<CharacterClass>(); characterClass->m_matches.swap(m_matches); characterClass->m_ranges.swap(m_ranges); characterClass->m_matchesUnicode.swap(m_matchesUnicode); characterClass->m_rangesUnicode.swap(m_rangesUnicode); + characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters(); + characterClass->m_anyCharacter = anyCharacter(); + + m_hasNonBMPCharacters = false; + m_anyCharacter = false; - return characterClass.release(); + return characterClass; } private: - void addSorted(Vector<UChar>& matches, UChar ch) + void addSorted(UChar32 ch) + { + addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch); + } + + void addSorted(Vector<UChar32>& matches, UChar32 ch) { unsigned pos = 0; - ASSERT(matches.size() <= UINT_MAX); - unsigned range = static_cast<unsigned>(matches.size()); + unsigned range = matches.size(); + + if (!U_IS_BMP(ch)) + m_hasNonBMPCharacters = true; // binary chop, find position to insert char. 
while (range) { @@ -201,9 +278,31 @@ private: int val = matches[pos+index] - ch; if (!val) return; - else if (val > 0) + else if (val > 0) { + if (val == 1) { + UChar32 lo = ch; + UChar32 hi = ch + 1; + matches.remove(pos + index); + if (pos + index > 0 && matches[pos + index - 1] == ch - 1) { + lo = ch - 1; + matches.remove(pos + index - 1); + } + addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); + return; + } range = index; - else { + } else { + if (val == -1) { + UChar32 lo = ch - 1; + UChar32 hi = ch; + matches.remove(pos + index); + if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) { + hi = ch + 1; + matches.remove(pos + index + 1); + } + addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); + return; + } pos += (index+1); range -= (index+1); } @@ -215,17 +314,19 @@ private: matches.insert(pos, ch); } - void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) + void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi) { - ASSERT(ranges.size() <= UINT_MAX); - unsigned end = static_cast<unsigned>(ranges.size()); - + size_t end = ranges.size(); + + if (!U_IS_BMP(hi)) + m_hasNonBMPCharacters = true; + // Simple linear scan - I doubt there are that many ranges anyway... // feel free to fix this with something faster (eg binary chop). - for (unsigned i = 0; i < end; ++i) { + for (size_t i = 0; i < end; ++i) { // does the new range fall before the current position in the array if (hi < ranges[i].begin) { - // optional optimization: concatenate appending ranges? - may not be worthwhile. + // Concatenate appending ranges. 
if (hi == (ranges[i].begin - 1)) { ranges[i].begin = lo; return; @@ -233,7 +334,7 @@ private: ranges.insert(i, CharacterRange(lo, hi)); return; } - // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining // If the new range start at or before the end of the last range, then the overlap (if it starts one after the // end of the last range they concatenate, which is just as good. if (lo <= (ranges[i].end + 1)) { @@ -241,18 +342,7 @@ private: ranges[i].begin = std::min(ranges[i].begin, lo); ranges[i].end = std::max(ranges[i].end, hi); - // now check if the new range can subsume any subsequent ranges. - unsigned next = i+1; - // each iteration of the loop we will either remove something from the list, or break the loop. - while (next < ranges.size()) { - if (ranges[next].begin <= (ranges[i].end + 1)) { - // the next entry now overlaps / concatenates this one. - ranges[i].end = std::max(ranges[i].end, ranges[next].end); - ranges.remove(next); - } else - break; - } - + mergeRangesFrom(ranges, i); return; } } @@ -261,25 +351,95 @@ private: ranges.append(CharacterRange(lo, hi)); } - bool m_isCaseInsensitive; + void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index) + { + size_t next = index + 1; + + // each iteration of the loop we will either remove something from the list, or break out of the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[index].end + 1)) { + // the next entry now overlaps / concatenates with this one. 
+ ranges[index].end = std::max(ranges[index].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + } + + void coalesceTables() + { + auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) { + + size_t matchesIndex = 0; + size_t rangesIndex = 0; + + while (matchesIndex < matches.size() && rangesIndex < ranges.size()) { + while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1) + matchesIndex++; + + if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) { + ranges[rangesIndex].begin = matches[matchesIndex]; + matches.remove(matchesIndex); + } + + while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1) + matchesIndex++; + + if (matchesIndex < matches.size()) { + if (matches[matchesIndex] == ranges[rangesIndex].end + 1) { + ranges[rangesIndex].end = matches[matchesIndex]; + matches.remove(matchesIndex); + + mergeRangesFrom(ranges, rangesIndex); + } else + matchesIndex++; + } + } + }; + + coalesceMatchesAndRanges(m_matches, m_ranges); + coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode); + + if (!m_matches.size() && !m_matchesUnicode.size() + && m_ranges.size() == 1 && m_rangesUnicode.size() == 1 + && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f + && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff) + m_anyCharacter = true; + } - Vector<UChar> m_matches; + bool hasNonBMPCharacters() + { + return m_hasNonBMPCharacters; + } + + bool anyCharacter() + { + return m_anyCharacter; + } + + bool m_isCaseInsensitive : 1; + bool m_hasNonBMPCharacters : 1; + bool m_anyCharacter : 1; + CanonicalMode m_canonicalMode; + + Vector<UChar32> m_matches; Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; + Vector<UChar32> m_matchesUnicode; Vector<CharacterRange> m_rangesUnicode; }; class YarrPatternConstructor { public: - YarrPatternConstructor(YarrPattern& pattern) + 
YarrPatternConstructor(YarrPattern& pattern, void* stackLimit) : m_pattern(pattern) - , m_characterClassConstructor(pattern.m_ignoreCase) - , m_invertParentheticalAssertion(false) + , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2) + , m_stackLimit(stackLimit) { - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); + auto body = std::make_unique<PatternDisjunction>(); m_pattern.m_body = body.get(); m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); + m_pattern.m_disjunctions.append(WTFMove(body)); } ~YarrPatternConstructor() @@ -291,15 +451,15 @@ public: m_pattern.reset(); m_characterClassConstructor.reset(); - OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction); + auto body = std::make_unique<PatternDisjunction>(); m_pattern.m_body = body.get(); m_alternative = body->addNewAlternative(); - m_pattern.m_disjunctions.append(body.release()); + m_pattern.m_disjunctions.append(WTFMove(body)); } void assertionBOL() { - if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { + if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) { m_alternative->m_startsWithBOL = true; m_alternative->m_containsBOL = true; m_pattern.m_containsBOL = true; @@ -315,41 +475,51 @@ public: m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); } - void atomPatternCharacter(UChar ch) + void atomPatternCharacter(UChar32 ch) { // We handle case-insensitive checking of unicode characters which do have both // cases by handling them as if they were defined using a CharacterClass. - if (!m_pattern.m_ignoreCase || isASCII(ch)) { + if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) { m_alternative->m_terms.append(PatternTerm(ch)); return; } - UCS2CanonicalizationRange* info = rangeInfoFor(ch); + const CanonicalizationRange* info = canonicalRangeInfoFor(ch, m_pattern.unicode() ? 
CanonicalMode::Unicode : CanonicalMode::UCS2); if (info->type == CanonicalizeUnique) { m_alternative->m_terms.append(PatternTerm(ch)); return; } m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); + auto newCharacterClass = m_characterClassConstructor.charClass(); m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); + m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); } void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) { switch (classID) { - case DigitClassID: + case BuiltInCharacterClassID::DigitClassID: m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); break; - case SpaceClassID: + case BuiltInCharacterClassID::SpaceClassID: m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); break; - case WordClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + case BuiltInCharacterClassID::WordClassID: + if (m_pattern.unicode() && m_pattern.ignoreCase()) + m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert)); + else + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); + break; + case BuiltInCharacterClassID::DotClassID: + ASSERT(!invert); + if (m_pattern.dotAll()) + m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false)); + else + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), true)); break; - case NewlineClassID: - m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); + default: + m_alternative->m_terms.append(PatternTerm(m_pattern.unicodeCharacterClassFor(classID), invert)); break; } } @@ -359,64 +529,83 @@ public: m_invertCharacterClass = invert; } - void 
atomCharacterClassAtom(UChar ch) + void atomCharacterClassAtom(UChar32 ch) { m_characterClassConstructor.putChar(ch); } - void atomCharacterClassRange(UChar begin, UChar end) + void atomCharacterClassRange(UChar32 begin, UChar32 end) { m_characterClassConstructor.putRange(begin, end); } void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) { - ASSERT(classID != NewlineClassID); + ASSERT(classID != BuiltInCharacterClassID::DotClassID); switch (classID) { - case DigitClassID: + case BuiltInCharacterClassID::DigitClassID: m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); break; - case SpaceClassID: + case BuiltInCharacterClassID::SpaceClassID: m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); break; - case WordClassID: - m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); + case BuiltInCharacterClassID::WordClassID: + if (m_pattern.unicode() && m_pattern.ignoreCase()) + m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass()); + else + m_characterClassConstructor.append(invert ? 
m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); break; default: - RELEASE_ASSERT_NOT_REACHED(); + if (!invert) + m_characterClassConstructor.append(m_pattern.unicodeCharacterClassFor(classID)); + else + m_characterClassConstructor.appendInverted(m_pattern.unicodeCharacterClassFor(classID)); } } void atomCharacterClassEnd() { - OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass(); + auto newCharacterClass = m_characterClassConstructor.charClass(); + + if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) { + m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false)); + return; + } m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); - m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); + m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); } - void atomParenthesesSubpatternBegin(bool capture = true) + void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt) { unsigned subpatternId = m_pattern.m_numSubpatterns + 1; - if (capture) + if (capture) { m_pattern.m_numSubpatterns++; + if (optGroupName) { + while (m_pattern.m_captureGroupNames.size() < subpatternId) + m_pattern.m_captureGroupNames.append(String()); + m_pattern.m_captureGroupNames.append(optGroupName.value()); + m_pattern.m_namedGroupToParenIndex.add(optGroupName.value(), subpatternId); + } + } else + ASSERT(!optGroupName); - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); m_alternative = parenthesesDisjunction->addNewAlternative(); - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); + 
m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); } void atomParentheticalAssertionBegin(bool invert = false) { - OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); m_alternative = parenthesesDisjunction->addNewAlternative(); m_invertParentheticalAssertion = invert; - m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); } void atomParenthesesEnd() @@ -429,8 +618,7 @@ public: PatternTerm& lastTerm = m_alternative->lastTerm(); - ASSERT(parenthesesDisjunction->m_alternatives.size() <= UINT_MAX); - unsigned numParenAlternatives = static_cast<unsigned>(parenthesesDisjunction->m_alternatives.size()); + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); unsigned numBOLAnchoredAlts = 0; for (unsigned i = 0; i < numParenAlternatives; i++) { @@ -478,16 +666,22 @@ public: m_alternative->m_terms.append(PatternTerm(subpatternId)); } - // deep copy the argument disjunction. If filterStartsWithBOL is true, + void atomNamedBackReference(String subpatternName) + { + ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end()); + atomBackReference(m_pattern.m_namedGroupToParenIndex.get(subpatternName)); + } + + // deep copy the argument disjunction. If filterStartsWithBOL is true, // skip alternatives with m_startsWithBOL set true. 
PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) { - OwnPtr<PatternDisjunction> newDisjunction; + std::unique_ptr<PatternDisjunction> newDisjunction; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { if (!newDisjunction) { - newDisjunction = adoptPtr(new PatternDisjunction()); + newDisjunction = std::make_unique<PatternDisjunction>(); newDisjunction->m_parent = disjunction->m_parent; } PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); @@ -501,7 +695,7 @@ public: return 0; PatternDisjunction* copiedDisjunction = newDisjunction.get(); - m_pattern.m_disjunctions.append(newDisjunction.release()); + m_pattern.m_disjunctions.append(WTFMove(newDisjunction)); return copiedDisjunction; } @@ -512,6 +706,7 @@ public: PatternTerm termCopy = term; termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); + m_pattern.m_hasCopiedParenSubexpressions = true; return termCopy; } @@ -527,7 +722,7 @@ public: PatternTerm& term = m_alternative->lastTerm(); ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); - ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); + ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount); if (term.type == PatternTerm::TypeParentheticalAssertion) { // If an assertion is quantified with a minimum count of zero, it can simply be removed. @@ -549,12 +744,12 @@ public: return; } - if (min == 0) - term.quantify(max, greedy ? 
QuantifierGreedy : QuantifierNonGreedy); - else if (min == max) - term.quantify(min, QuantifierFixedCount); + if (min == max) + term.quantify(min, max, QuantifierFixedCount); + else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions)) + term.quantify(min, max, greedy ? QuantifierGreedy : QuantifierNonGreedy); else { - term.quantify(min, QuantifierFixedCount); + term.quantify(min, min, QuantifierFixedCount); m_alternative->m_terms.append(copyTerm(term)); // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); @@ -568,10 +763,14 @@ public: m_alternative = m_alternative->m_parent->addNewAlternative(); } - unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) + ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN { + if (UNLIKELY(!isSafeToRecurse())) + return ErrorCode::TooManyDisjunctions; + + ErrorCode error = ErrorCode::NoError; alternative->m_hasFixedSize = true; - Checked<unsigned> currentInputPosition = initialInputPosition; + Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition; for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { PatternTerm& term = alternative->m_terms[i]; @@ -599,8 +798,14 @@ public: term.frameLocation = currentCallFrameSize; currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; alternative->m_hasFixedSize = false; + } else if (m_pattern.unicode()) { + Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount; + tempCount *= U16_LENGTH(term.patternCharacter); + if (tempCount.hasOverflowed()) + return ErrorCode::OffsetTooLarge; + currentInputPosition += tempCount; } 
else - currentInputPosition += term.quantityCount; + currentInputPosition += term.quantityMaxCount; break; case PatternTerm::TypeCharacterClass: @@ -609,29 +814,39 @@ public: term.frameLocation = currentCallFrameSize; currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; alternative->m_hasFixedSize = false; + } else if (m_pattern.unicode()) { + term.frameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; + currentInputPosition += term.quantityMaxCount; + alternative->m_hasFixedSize = false; } else - currentInputPosition += term.quantityCount; + currentInputPosition += term.quantityMaxCount; break; case PatternTerm::TypeParenthesesSubpattern: // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. term.frameLocation = currentCallFrameSize; - if (term.quantityCount == 1 && !term.parentheses.isCopy) { - if (term.quantityType != QuantifierFixedCount) - currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; // If quantity is fixed, then pre-check its minimum size. 
if (term.quantityType == QuantifierFixedCount) currentInputPosition += term.parentheses.disjunction->m_minimumSize; term.inputPosition = currentInputPosition.unsafeGet(); } else if (term.parentheses.isTerminal) { currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; term.inputPosition = currentInputPosition.unsafeGet(); } else { term.inputPosition = currentInputPosition.unsafeGet(); - setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet()); currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; } // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. 
alternative->m_hasFixedSize = false; @@ -640,35 +855,53 @@ public: case PatternTerm::TypeParentheticalAssertion: term.inputPosition = currentInputPosition.unsafeGet(); term.frameLocation = currentCallFrameSize; - currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet()); + error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet(), currentCallFrameSize); + if (hasError(error)) + return error; break; case PatternTerm::TypeDotStarEnclosure: + ASSERT(!m_pattern.m_saveInitialStartValue); alternative->m_hasFixedSize = false; term.inputPosition = initialInputPosition; + m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize; + currentCallFrameSize += YarrStackSpaceForDotStarEnclosure; + m_pattern.m_saveInitialStartValue = true; break; } + if (currentInputPosition.hasOverflowed()) + return ErrorCode::OffsetTooLarge; } alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); - return currentCallFrameSize; + newCallFrameSize = currentCallFrameSize; + return error; } - unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) + ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize) { + if (UNLIKELY(!isSafeToRecurse())) + return ErrorCode::TooManyDisjunctions; + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; unsigned minimumInputSize = UINT_MAX; unsigned maximumCallFrameSize = 0; bool hasFixedSize = true; + ErrorCode error = ErrorCode::NoError; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 
PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); - unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); + unsigned currentAlternativeCallFrameSize; + error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, currentAlternativeCallFrameSize); + if (hasError(error)) + return error; minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); hasFixedSize &= alternative->m_hasFixedSize; + if (alternative->m_minimumSize > INT_MAX) + m_pattern.m_containsUnsignedLengthPattern = true; } ASSERT(minimumInputSize != UINT_MAX); @@ -677,12 +910,15 @@ public: disjunction->m_hasFixedSize = hasFixedSize; disjunction->m_minimumSize = minimumInputSize; disjunction->m_callFrameSize = maximumCallFrameSize; - return maximumCallFrameSize; + callFrameSize = maximumCallFrameSize; + return error; } - void setupOffsets() + ErrorCode setupOffsets() { - setupDisjunctionOffsets(m_pattern.m_body, 0, 0); + // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314). + unsigned ignoredCallFrameSize; + return setupDisjunctionOffsets(m_pattern.m_body, 0, 0, ignoredCallFrameSize); } // This optimization identifies sets of parentheses that we will never need to backtrack. 
@@ -699,14 +935,15 @@ public: if (m_pattern.m_numSubpatterns) return; - Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; for (size_t i = 0; i < alternatives.size(); ++i) { Vector<PatternTerm>& terms = alternatives[i]->m_terms; if (terms.size()) { PatternTerm& term = terms.last(); if (term.type == PatternTerm::TypeParenthesesSubpattern && term.quantityType == QuantifierGreedy - && term.quantityCount == quantifyInfinite + && term.quantityMinCount == 0 + && term.quantityMaxCount == quantifyInfinite && !term.capture()) term.parentheses.isTerminal = true; } @@ -722,7 +959,7 @@ public: // At this point, this is only valid for non-multiline expressions. PatternDisjunction* disjunction = m_pattern.m_body; - if (!m_pattern.m_containsBOL || m_pattern.m_multiline) + if (!m_pattern.m_containsBOL || m_pattern.multiline()) return; PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); @@ -740,11 +977,12 @@ public: } } - bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) + bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex) { Vector<PatternTerm>& terms = alternative->m_terms; - for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { + ASSERT(endIndex <= terms.size()); + for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) { PatternTerm& term = terms[termIndex]; if (term.m_capture) @@ -753,7 +991,7 @@ public: if (term.type == PatternTerm::TypeParenthesesSubpattern) { PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { - if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) + if 
(containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size())) return true; } } @@ -769,16 +1007,17 @@ public: // beginning and the end of the match. void optimizeDotStarWrappedExpressions() { - Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives; + Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; if (alternatives.size() != 1) return; + CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass(); PatternAlternative* alternative = alternatives[0].get(); Vector<PatternTerm>& terms = alternative->m_terms; if (terms.size() >= 3) { bool startsWithBOL = false; bool endsWithEOL = false; - size_t termIndex, firstExpressionTerm, lastExpressionTerm; + size_t termIndex, firstExpressionTerm; termIndex = 0; if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { @@ -787,7 +1026,10 @@ public: } PatternTerm& firstNonAnchorTerm = terms[termIndex]; - if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) + if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass + || firstNonAnchorTerm.characterClass != dotCharacterClass + || firstNonAnchorTerm.quantityMinCount + || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite) return; firstExpressionTerm = termIndex + 1; @@ -799,16 +1041,19 @@ public: } PatternTerm& lastNonAnchorTerm = terms[termIndex]; - if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) + if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass + || lastNonAnchorTerm.characterClass != dotCharacterClass 
+ || lastNonAnchorTerm.quantityType != QuantifierGreedy + || lastNonAnchorTerm.quantityMinCount + || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite) return; - - lastExpressionTerm = termIndex - 1; - if (firstExpressionTerm > lastExpressionTerm) + size_t endIndex = termIndex; + if (firstExpressionTerm >= endIndex) return; - if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { - for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) + if (!containsCapturingTerms(alternative, firstExpressionTerm, endIndex)) { + for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex) terms.remove(termIndex); for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) @@ -822,62 +1067,445 @@ public: } private: + bool isSafeToRecurse() const + { + if (!m_stackLimit) + return true; + int8_t* curr = reinterpret_cast<int8_t*>(&curr); + int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit); + return curr >= limit; + } + YarrPattern& m_pattern; PatternAlternative* m_alternative; CharacterClassConstructor m_characterClassConstructor; + void* m_stackLimit; bool m_invertCharacterClass; - bool m_invertParentheticalAssertion; + bool m_invertParentheticalAssertion { false }; }; -const char* YarrPattern::compile(const String& patternString) +ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit) { - YarrPatternConstructor constructor(*this); + YarrPatternConstructor constructor(*this, stackLimit); - if (const char* error = parse(constructor, patternString)) - return error; + if (m_flags == InvalidFlags) + return ErrorCode::InvalidRegularExpressionFlags; + + { + ErrorCode error = parse(constructor, patternString, unicode()); + if (hasError(error)) + return error; + } // If the pattern contains illegal backreferences reset & reparse. 
// Quoting Netscape's "What's new in JavaScript 1.2", // "Note: if the number of left parentheses is less than the number specified // in \#, the \# is taken as an octal escape as described in the next row." if (containsIllegalBackReference()) { + if (unicode()) + return ErrorCode::InvalidBackreference; + unsigned numSubpatterns = m_numSubpatterns; constructor.reset(); -#if !ASSERT_DISABLED - const char* error = -#endif - parse(constructor, patternString, numSubpatterns); - - ASSERT(!error); + ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns); + ASSERT_UNUSED(error, !hasError(error)); ASSERT(numSubpatterns == m_numSubpatterns); } constructor.checkForTerminalParentheses(); constructor.optimizeDotStarWrappedExpressions(); constructor.optimizeBOL(); - - constructor.setupOffsets(); - return 0; + { + ErrorCode error = constructor.setupOffsets(); + if (hasError(error)) + return error; + } + + if (Options::dumpCompiledRegExpPatterns()) + dumpPattern(patternString); + + return ErrorCode::NoError; } -YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) - : m_ignoreCase(ignoreCase) - , m_multiline(multiline) - , m_containsBackreferences(false) +YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, ErrorCode& error, void* stackLimit) + : m_containsBackreferences(false) , m_containsBOL(false) - , m_numSubpatterns(0) - , m_maxBackReference(0) - , newlineCached(0) - , digitsCached(0) - , spacesCached(0) - , wordcharCached(0) - , nondigitsCached(0) - , nonspacesCached(0) - , nonwordcharCached(0) + , m_containsUnsignedLengthPattern(false) + , m_hasCopiedParenSubexpressions(false) + , m_saveInitialStartValue(false) + , m_flags(flags) +{ + error = compile(pattern, stackLimit); +} + +void indentForNestingLevel(PrintStream& out, unsigned nestingDepth) +{ + out.print(" "); + for (; nestingDepth; --nestingDepth) + out.print(" "); +} + +void dumpUChar32(PrintStream& out, UChar32 c) +{ + if (c >= ' 
'&& c <= 0xff) + out.printf("'%c'", static_cast<char>(c)); + else + out.printf("0x%04x", c); +} + +void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass) +{ + if (characterClass == pattern->anyCharacterClass()) + out.print("<any character>"); + else if (characterClass == pattern->newlineCharacterClass()) + out.print("<newline>"); + else if (characterClass == pattern->digitsCharacterClass()) + out.print("<digits>"); + else if (characterClass == pattern->spacesCharacterClass()) + out.print("<whitespace>"); + else if (characterClass == pattern->wordcharCharacterClass()) + out.print("<word>"); + else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode ignore case>"); + else if (characterClass == pattern->nondigitsCharacterClass()) + out.print("<non-digits>"); + else if (characterClass == pattern->nonspacesCharacterClass()) + out.print("<non-whitespace>"); + else if (characterClass == pattern->nonwordcharCharacterClass()) + out.print("<non-word>"); + else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode non-ignore case>"); + else { + bool needMatchesRangesSeperator = false; + + auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) { + size_t matchesSize = matches.size(); + if (matchesSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, ":("); + for (size_t i = 0; i < matchesSize; ++i) { + if (i) + out.print(","); + dumpUChar32(out, matches[i]); + } + out.print(")"); + } + }; + + auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) { + size_t rangeSize = ranges.size(); + if (rangeSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, " ranges:("); + for (size_t i = 0; i < rangeSize; ++i) { + if (i) + out.print(","); + CharacterRange range = ranges[i]; + out.print("("); + 
dumpUChar32(out, range.begin); + out.print(".."); + dumpUChar32(out, range.end); + out.print(")"); + } + out.print(")"); + } + }; + + out.print("["); + dumpMatches("ASCII", characterClass->m_matches); + dumpRanges("ASCII", characterClass->m_ranges); + dumpMatches("Unicode", characterClass->m_matchesUnicode); + dumpRanges("Unicode", characterClass->m_rangesUnicode); + out.print("]"); + } +} + +void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth) +{ + out.print("minimum size: ", m_minimumSize); + if (m_hasFixedSize) + out.print(",fixed size"); + if (m_onceThrough) + out.print(",once through"); + if (m_startsWithBOL) + out.print(",starts with ^"); + if (m_containsBOL) + out.print(",contains ^"); + out.print("\n"); + + for (size_t i = 0; i < m_terms.size(); ++i) + m_terms[i].dump(out, thisPattern, nestingDepth); +} + +void PatternTerm::dumpQuantifier(PrintStream& out) +{ + if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1) + return; + out.print(" {", quantityMinCount.unsafeGet()); + if (quantityMinCount != quantityMaxCount) { + if (quantityMaxCount == UINT_MAX) + out.print(",..."); + else + out.print(",", quantityMaxCount.unsafeGet()); + } + out.print("}"); + if (quantityType == QuantifierGreedy) + out.print(" greedy"); + else if (quantityType == QuantifierNonGreedy) + out.print(" non-greedy"); +} + +void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth) +{ + indentForNestingLevel(out, nestingDepth); + + if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) { + if (invert()) + out.print("not "); + } + + switch (type) { + case TypeAssertionBOL: + out.println("BOL"); + break; + case TypeAssertionEOL: + out.println("EOL"); + break; + case TypeAssertionWordBoundary: + out.println("word boundary"); + break; + case TypePatternCharacter: + out.printf("character "); + out.printf("inputPosition %u ", inputPosition); + if 
(thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) { + dumpUChar32(out, toASCIIUpper(patternCharacter)); + out.print("/"); + dumpUChar32(out, toASCIILower(patternCharacter)); + } else + dumpUChar32(out, patternCharacter); + dumpQuantifier(out); + if (quantityType != QuantifierFixedCount) + out.print(",frame location ", frameLocation); + out.println(); + break; + case TypeCharacterClass: + out.print("character class "); + if (characterClass->m_anyCharacter) + out.print("<any character>"); + else if (characterClass == thisPattern->newlineCharacterClass()) + out.print("<newline>"); + else if (characterClass == thisPattern->digitsCharacterClass()) + out.print("<digits>"); + else if (characterClass == thisPattern->spacesCharacterClass()) + out.print("<whitespace>"); + else if (characterClass == thisPattern->wordcharCharacterClass()) + out.print("<word>"); + else if (characterClass == thisPattern->wordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode ignore case>"); + else if (characterClass == thisPattern->nondigitsCharacterClass()) + out.print("<non-digits>"); + else if (characterClass == thisPattern->nonspacesCharacterClass()) + out.print("<non-whitespace>"); + else if (characterClass == thisPattern->nonwordcharCharacterClass()) + out.print("<non-word>"); + else if (characterClass == thisPattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) + out.print("<unicode non-ignore case>"); + else { + bool needMatchesRangesSeperator = false; + + auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) { + size_t matchesSize = matches.size(); + if (matchesSize) { + if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, ":("); + for (size_t i = 0; i < matchesSize; ++i) { + if (i) + out.print(","); + dumpUChar32(out, matches[i]); + } + out.print(")"); + } + }; + + auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) { + size_t rangeSize = ranges.size(); + if (rangeSize) { 
+ if (needMatchesRangesSeperator) + out.print(","); + needMatchesRangesSeperator = true; + + out.print(prefix, " ranges:("); + for (size_t i = 0; i < rangeSize; ++i) { + if (i) + out.print(","); + CharacterRange range = ranges[i]; + out.print("("); + dumpUChar32(out, range.begin); + out.print(".."); + dumpUChar32(out, range.end); + out.print(")"); + } + out.print(")"); + } + }; + + out.print("["); + dumpMatches("ASCII", characterClass->m_matches); + dumpRanges("ASCII", characterClass->m_ranges); + dumpMatches("Unicode", characterClass->m_matchesUnicode); + dumpRanges("Unicode", characterClass->m_rangesUnicode); + out.print("]"); + } + dumpQuantifier(out); + if (quantityType != QuantifierFixedCount || thisPattern->unicode()) + out.print(",frame location ", frameLocation); + out.println(); + break; + case TypeBackReference: + out.print("back reference to subpattern #", backReferenceSubpatternId); + out.println(",frame location ", frameLocation); + break; + case TypeForwardReference: + out.println("forward reference"); + break; + case TypeParenthesesSubpattern: + if (m_capture) + out.print("captured "); + else + out.print("non-captured "); + + FALLTHROUGH; + case TypeParentheticalAssertion: + if (m_invert) + out.print("inverted "); + + if (type == TypeParenthesesSubpattern) + out.print("subpattern"); + else if (type == TypeParentheticalAssertion) + out.print("assertion"); + + if (m_capture) + out.print(" #", parentheses.subpatternId); + + dumpQuantifier(out); + + if (parentheses.isCopy) + out.print(",copy"); + + if (parentheses.isTerminal) + out.print(",terminal"); + + out.println(",frame location ", frameLocation); + + if (parentheses.disjunction->m_alternatives.size() > 1) { + indentForNestingLevel(out, nestingDepth + 1); + unsigned alternativeFrameLocation = frameLocation; + if (quantityMaxCount == 1 && !parentheses.isCopy) + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; + else if (parentheses.isTerminal) + alternativeFrameLocation += 
YarrStackSpaceForBackTrackInfoParenthesesTerminal; + else + alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses; + out.println("alternative list,frame location ", alternativeFrameLocation); + } + + parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1); + break; + case TypeDotStarEnclosure: + out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation); + break; + } +} + +void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0) +{ + unsigned alternativeCount = m_alternatives.size(); + for (unsigned i = 0; i < alternativeCount; ++i) { + indentForNestingLevel(out, nestingDepth); + if (alternativeCount > 1) + out.print("alternative #", i, ": "); + m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1)); + } +} + +void YarrPattern::dumpPattern(const String& patternString) +{ + dumpPattern(WTF::dataFile(), patternString); +} + +void YarrPattern::dumpPattern(PrintStream& out, const String& patternString) +{ + out.print("RegExp pattern for /"); + out.print(patternString); + out.print("/"); + if (global()) + out.print("g"); + if (ignoreCase()) + out.print("i"); + if (multiline()) + out.print("m"); + if (unicode()) + out.print("u"); + if (sticky()) + out.print("y"); + if (m_flags != NoFlags) { + bool printSeperator = false; + out.print(" ("); + if (global()) { + out.print("global"); + printSeperator = true; + } + if (ignoreCase()) { + if (printSeperator) + out.print("|"); + out.print("ignore case"); + printSeperator = true; + } + if (multiline()) { + if (printSeperator) + out.print("|"); + out.print("multiline"); + printSeperator = true; + } + if (unicode()) { + if (printSeperator) + out.print("|"); + out.print("unicode"); + printSeperator = true; + } + if (sticky()) { + if (printSeperator) + out.print("|"); + out.print("sticky"); + printSeperator = true; + } + out.print(")"); + } + out.print(":\n"); + if (m_body->m_callFrameSize) + 
out.print(" callframe size: ", m_body->m_callFrameSize, "\n"); + m_body->dump(out, this); +} + +std::unique_ptr<CharacterClass> anycharCreate() { - *error = compile(pattern); + auto characterClass = std::make_unique<CharacterClass>(); + characterClass->m_ranges.append(CharacterRange(0x00, 0x7f)); + characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff)); + characterClass->m_hasNonBMPCharacters = true; + characterClass->m_anyCharacter = true; + return characterClass; } -} } +} } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrPattern.h b/src/3rdparty/masm/yarr/YarrPattern.h index e7d187c2b3..59decbac46 100644 --- a/src/3rdparty/masm/yarr/YarrPattern.h +++ b/src/3rdparty/masm/yarr/YarrPattern.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -24,26 +24,27 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef YarrPattern_h -#define YarrPattern_h +#pragma once +#include "RegExpKey.h" +#include "YarrErrorCode.h" +#include "YarrUnicodeProperties.h" #include <wtf/CheckedArithmetic.h> -#include <wtf/OwnPtr.h> -#include <wtf/PassOwnPtr.h> -#include <wtf/RefCounted.h> +#include <wtf/HashMap.h> +#include <wtf/PrintStream.h> #include <wtf/Vector.h> #include <wtf/text/WTFString.h> -#include <wtf/unicode/Unicode.h> namespace JSC { namespace Yarr { +struct YarrPattern; struct PatternDisjunction; struct CharacterRange { - UChar begin; - UChar end; + UChar32 begin { 0 }; + UChar32 end { 0x10ffff }; - CharacterRange(UChar begin, UChar end) + CharacterRange(UChar32 begin, UChar32 end) : begin(begin) , end(end) { @@ -58,20 +59,38 @@ public: // specified matches and ranges) CharacterClass() : m_table(0) + , m_hasNonBMPCharacters(false) + , m_anyCharacter(false) { } CharacterClass(const char* table, bool inverted) : m_table(table) , m_tableInverted(inverted) + , m_hasNonBMPCharacters(false) + , m_anyCharacter(false) { } - Vector<UChar> m_matches; + CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode) + : m_matches(matches) + , m_ranges(ranges) + , m_matchesUnicode(matchesUnicode) + , m_rangesUnicode(rangesUnicode) + , m_table(0) + , m_tableInverted(false) + , m_hasNonBMPCharacters(false) + , m_anyCharacter(false) + { + } + + Vector<UChar32> m_matches; Vector<CharacterRange> m_ranges; - Vector<UChar> m_matchesUnicode; + Vector<UChar32> m_matchesUnicode; Vector<CharacterRange> m_rangesUnicode; const char* m_table; - bool m_tableInverted; + bool m_tableInverted : 1; + bool m_hasNonBMPCharacters : 1; + bool m_anyCharacter : 1; }; enum QuantifierType { @@ -96,7 +115,7 @@ struct PatternTerm { bool m_capture :1; bool m_invert :1; union { - UChar patternCharacter; + UChar32 patternCharacter; CharacterClass* characterClass; unsigned 
backReferenceSubpatternId; struct { @@ -112,18 +131,19 @@ struct PatternTerm { } anchors; }; QuantifierType quantityType; - Checked<unsigned> quantityCount; - int inputPosition; + Checked<unsigned> quantityMinCount; + Checked<unsigned> quantityMaxCount; + unsigned inputPosition; unsigned frameLocation; - PatternTerm(UChar ch) + PatternTerm(UChar32 ch) : type(PatternTerm::TypePatternCharacter) , m_capture(false) , m_invert(false) { patternCharacter = ch; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(CharacterClass* charClass, bool invert) @@ -133,7 +153,7 @@ struct PatternTerm { { characterClass = charClass; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) @@ -146,7 +166,7 @@ struct PatternTerm { parentheses.isCopy = false; parentheses.isTerminal = false; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(Type type, bool invert = false) @@ -155,7 +175,7 @@ struct PatternTerm { , m_invert(invert) { quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(unsigned spatternId) @@ -165,7 +185,7 @@ struct PatternTerm { { backReferenceSubpatternId = spatternId; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } PatternTerm(bool bolAnchor, bool eolAnchor) @@ -176,7 +196,7 @@ struct PatternTerm { anchors.bolAnchor = bolAnchor; anchors.eolAnchor = eolAnchor; quantityType = QuantifierFixedCount; - quantityCount = 1; + quantityMinCount = quantityMaxCount = 1; } static PatternTerm ForwardReference() @@ -208,12 +228,32 @@ struct PatternTerm { { return m_capture; } - + + bool containsAnyCaptures() + { + ASSERT(this->type == TypeParenthesesSubpattern); + return 
parentheses.lastSubpatternId >= parentheses.subpatternId; + } + void quantify(unsigned count, QuantifierType type) { - quantityCount = count; + quantityMinCount = 0; + quantityMaxCount = count; quantityType = type; } + + void quantify(unsigned minCount, unsigned maxCount, QuantifierType type) + { + // Currently only Parentheses can specify a non-zero min with a different max. + ASSERT(this->type == TypeParenthesesSubpattern || !minCount || minCount == maxCount); + ASSERT(minCount <= maxCount); + quantityMinCount = minCount; + quantityMaxCount = maxCount; + quantityType = type; + } + + void dumpQuantifier(PrintStream&); + void dump(PrintStream&, YarrPattern*, unsigned); }; struct PatternAlternative { @@ -250,6 +290,8 @@ public: return m_onceThrough; } + void dump(PrintStream&, YarrPattern*, unsigned); + Vector<PatternTerm> m_terms; PatternDisjunction* m_parent; unsigned m_minimumSize; @@ -270,12 +312,13 @@ public: PatternAlternative* addNewAlternative() { - PatternAlternative* alternative = new PatternAlternative(this); - m_alternatives.append(adoptPtr(alternative)); - return alternative; + m_alternatives.append(std::make_unique<PatternAlternative>(this)); + return static_cast<PatternAlternative*>(m_alternatives.last().get()); } - Vector<OwnPtr<PatternAlternative> > m_alternatives; + void dump(PrintStream&, YarrPattern*, unsigned); + + Vector<std::unique_ptr<PatternAlternative>> m_alternatives; PatternAlternative* m_parent; unsigned m_minimumSize; unsigned m_callFrameSize; @@ -286,13 +329,17 @@ public: // (please to be calling newlineCharacterClass() et al on your // friendly neighborhood YarrPattern instance to get nicely // cached copies). 
-CharacterClass* newlineCreate(); -CharacterClass* digitsCreate(); -CharacterClass* spacesCreate(); -CharacterClass* wordcharCreate(); -CharacterClass* nondigitsCreate(); -CharacterClass* nonspacesCreate(); -CharacterClass* nonwordcharCreate(); + +std::unique_ptr<CharacterClass> anycharCreate(); +std::unique_ptr<CharacterClass> newlineCreate(); +std::unique_ptr<CharacterClass> digitsCreate(); +std::unique_ptr<CharacterClass> spacesCreate(); +std::unique_ptr<CharacterClass> wordcharCreate(); +std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate(); +std::unique_ptr<CharacterClass> nondigitsCreate(); +std::unique_ptr<CharacterClass> nonspacesCreate(); +std::unique_ptr<CharacterClass> nonwordcharCreate(); +std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate(); struct TermChain { TermChain(PatternTerm term) @@ -303,27 +350,37 @@ struct TermChain { Vector<TermChain> hotTerms; }; + struct YarrPattern { - JS_EXPORT_PRIVATE YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error); + JS_EXPORT_PRIVATE YarrPattern(const String& pattern, RegExpFlags, ErrorCode&, void* stackLimit = nullptr); void reset() { m_numSubpatterns = 0; m_maxBackReference = 0; + m_initialStartValueFrameLocation = 0; m_containsBackreferences = false; m_containsBOL = false; - - newlineCached = 0; - digitsCached = 0; - spacesCached = 0; - wordcharCached = 0; - nondigitsCached = 0; - nonspacesCached = 0; - nonwordcharCached = 0; + m_containsUnsignedLengthPattern = false; + m_hasCopiedParenSubexpressions = false; + m_saveInitialStartValue = false; + + anycharCached = nullptr; + newlineCached = nullptr; + digitsCached = nullptr; + spacesCached = nullptr; + wordcharCached = nullptr; + wordUnicodeIgnoreCaseCharCached = nullptr; + nondigitsCached = nullptr; + nonspacesCached = nullptr; + nonwordcharCached = nullptr; + nonwordUnicodeIgnoreCasecharCached = nullptr; + unicodePropertiesCached.clear(); m_disjunctions.clear(); m_userCharacterClasses.clear(); + 
m_captureGroupNames.shrink(0); } bool containsIllegalBackReference() @@ -331,71 +388,212 @@ struct YarrPattern { return m_maxBackReference > m_numSubpatterns; } + bool containsUnsignedLengthPattern() + { + return m_containsUnsignedLengthPattern; + } + + CharacterClass* anyCharacterClass() + { + if (!anycharCached) { + m_userCharacterClasses.append(anycharCreate()); + anycharCached = m_userCharacterClasses.last().get(); + } + return anycharCached; + } CharacterClass* newlineCharacterClass() { - if (!newlineCached) - m_userCharacterClasses.append(adoptPtr(newlineCached = newlineCreate())); + if (!newlineCached) { + m_userCharacterClasses.append(newlineCreate()); + newlineCached = m_userCharacterClasses.last().get(); + } return newlineCached; } CharacterClass* digitsCharacterClass() { - if (!digitsCached) - m_userCharacterClasses.append(adoptPtr(digitsCached = digitsCreate())); + if (!digitsCached) { + m_userCharacterClasses.append(digitsCreate()); + digitsCached = m_userCharacterClasses.last().get(); + } return digitsCached; } CharacterClass* spacesCharacterClass() { - if (!spacesCached) - m_userCharacterClasses.append(adoptPtr(spacesCached = spacesCreate())); + if (!spacesCached) { + m_userCharacterClasses.append(spacesCreate()); + spacesCached = m_userCharacterClasses.last().get(); + } return spacesCached; } CharacterClass* wordcharCharacterClass() { - if (!wordcharCached) - m_userCharacterClasses.append(adoptPtr(wordcharCached = wordcharCreate())); + if (!wordcharCached) { + m_userCharacterClasses.append(wordcharCreate()); + wordcharCached = m_userCharacterClasses.last().get(); + } return wordcharCached; } + CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass() + { + if (!wordUnicodeIgnoreCaseCharCached) { + m_userCharacterClasses.append(wordUnicodeIgnoreCaseCharCreate()); + wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get(); + } + return wordUnicodeIgnoreCaseCharCached; + } CharacterClass* nondigitsCharacterClass() { - if (!nondigitsCached) 
- m_userCharacterClasses.append(adoptPtr(nondigitsCached = nondigitsCreate())); + if (!nondigitsCached) { + m_userCharacterClasses.append(nondigitsCreate()); + nondigitsCached = m_userCharacterClasses.last().get(); + } return nondigitsCached; } CharacterClass* nonspacesCharacterClass() { - if (!nonspacesCached) - m_userCharacterClasses.append(adoptPtr(nonspacesCached = nonspacesCreate())); + if (!nonspacesCached) { + m_userCharacterClasses.append(nonspacesCreate()); + nonspacesCached = m_userCharacterClasses.last().get(); + } return nonspacesCached; } CharacterClass* nonwordcharCharacterClass() { - if (!nonwordcharCached) - m_userCharacterClasses.append(adoptPtr(nonwordcharCached = nonwordcharCreate())); + if (!nonwordcharCached) { + m_userCharacterClasses.append(nonwordcharCreate()); + nonwordcharCached = m_userCharacterClasses.last().get(); + } return nonwordcharCached; } + CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass() + { + if (!nonwordUnicodeIgnoreCasecharCached) { + m_userCharacterClasses.append(nonwordUnicodeIgnoreCaseCharCreate()); + nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get(); + } + return nonwordUnicodeIgnoreCasecharCached; + } + CharacterClass* unicodeCharacterClassFor(BuiltInCharacterClassID unicodeClassID) + { + ASSERT(unicodeClassID >= BuiltInCharacterClassID::BaseUnicodePropertyID); + + unsigned classID = static_cast<unsigned>(unicodeClassID); + + if (unicodePropertiesCached.find(classID) == unicodePropertiesCached.end()) { + m_userCharacterClasses.append(createUnicodeCharacterClassFor(unicodeClassID)); + CharacterClass* result = m_userCharacterClasses.last().get(); + unicodePropertiesCached.add(classID, result); + return result; + } + + return unicodePropertiesCached.get(classID); + } + + void dumpPattern(const String& pattern); + void dumpPattern(PrintStream& out, const String& pattern); + + bool global() const { return m_flags & FlagGlobal; } + bool ignoreCase() const { return m_flags & FlagIgnoreCase; } 
+ bool multiline() const { return m_flags & FlagMultiline; } + bool sticky() const { return m_flags & FlagSticky; } + bool unicode() const { return m_flags & FlagUnicode; } + bool dotAll() const { return m_flags & FlagDotAll; } - bool m_ignoreCase : 1; - bool m_multiline : 1; bool m_containsBackreferences : 1; bool m_containsBOL : 1; - unsigned m_numSubpatterns; - unsigned m_maxBackReference; + bool m_containsUnsignedLengthPattern : 1; + bool m_hasCopiedParenSubexpressions : 1; + bool m_saveInitialStartValue : 1; + RegExpFlags m_flags; + unsigned m_numSubpatterns { 0 }; + unsigned m_maxBackReference { 0 }; + unsigned m_initialStartValueFrameLocation { 0 }; PatternDisjunction* m_body; - Vector<OwnPtr<PatternDisjunction>, 4> m_disjunctions; - Vector<OwnPtr<CharacterClass> > m_userCharacterClasses; + Vector<std::unique_ptr<PatternDisjunction>, 4> m_disjunctions; + Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; + Vector<String> m_captureGroupNames; + HashMap<String, unsigned> m_namedGroupToParenIndex; private: - const char* compile(const String& patternString); - - CharacterClass* newlineCached; - CharacterClass* digitsCached; - CharacterClass* spacesCached; - CharacterClass* wordcharCached; - CharacterClass* nondigitsCached; - CharacterClass* nonspacesCached; - CharacterClass* nonwordcharCached; + ErrorCode compile(const String& patternString, void* stackLimit); + + CharacterClass* anycharCached { nullptr }; + CharacterClass* newlineCached { nullptr }; + CharacterClass* digitsCached { nullptr }; + CharacterClass* spacesCached { nullptr }; + CharacterClass* wordcharCached { nullptr }; + CharacterClass* wordUnicodeIgnoreCaseCharCached { nullptr }; + CharacterClass* nondigitsCached { nullptr }; + CharacterClass* nonspacesCached { nullptr }; + CharacterClass* nonwordcharCached { nullptr }; + CharacterClass* nonwordUnicodeIgnoreCasecharCached { nullptr }; + HashMap<unsigned, CharacterClass*> unicodePropertiesCached; }; -} } // namespace JSC::Yarr + void 
indentForNestingLevel(PrintStream&, unsigned); + void dumpUChar32(PrintStream&, UChar32); + void dumpCharacterClass(PrintStream&, YarrPattern*, CharacterClass*); + + struct BackTrackInfoPatternCharacter { + uintptr_t begin; // Only needed for unicode patterns + uintptr_t matchAmount; + + static unsigned beginIndex() { return offsetof(BackTrackInfoPatternCharacter, begin) / sizeof(uintptr_t); } + static unsigned matchAmountIndex() { return offsetof(BackTrackInfoPatternCharacter, matchAmount) / sizeof(uintptr_t); } + }; -#endif // YarrPattern_h + struct BackTrackInfoCharacterClass { + uintptr_t begin; // Only needed for unicode patterns + uintptr_t matchAmount; + + static unsigned beginIndex() { return offsetof(BackTrackInfoCharacterClass, begin) / sizeof(uintptr_t); } + static unsigned matchAmountIndex() { return offsetof(BackTrackInfoCharacterClass, matchAmount) / sizeof(uintptr_t); } + }; + + struct BackTrackInfoBackReference { + uintptr_t begin; // Not really needed for greedy quantifiers. + uintptr_t matchAmount; // Not really needed for fixed quantifiers. 
+ + unsigned beginIndex() { return offsetof(BackTrackInfoBackReference, begin) / sizeof(uintptr_t); } + unsigned matchAmountIndex() { return offsetof(BackTrackInfoBackReference, matchAmount) / sizeof(uintptr_t); } + }; + + struct BackTrackInfoAlternative { + union { + uintptr_t offset; + }; + }; + + struct BackTrackInfoParentheticalAssertion { + uintptr_t begin; + + static unsigned beginIndex() { return offsetof(BackTrackInfoParentheticalAssertion, begin) / sizeof(uintptr_t); } + }; + + struct BackTrackInfoParenthesesOnce { + uintptr_t begin; + uintptr_t returnAddress; + + static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesOnce, begin) / sizeof(uintptr_t); } + static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParenthesesOnce, returnAddress) / sizeof(uintptr_t); } + }; + + struct BackTrackInfoParenthesesTerminal { + uintptr_t begin; + + static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesTerminal, begin) / sizeof(uintptr_t); } + }; + + struct BackTrackInfoParentheses { + uintptr_t begin; + uintptr_t returnAddress; + uintptr_t matchAmount; + uintptr_t parenContextHead; + + static unsigned beginIndex() { return offsetof(BackTrackInfoParentheses, begin) / sizeof(uintptr_t); } + static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParentheses, returnAddress) / sizeof(uintptr_t); } + static unsigned matchAmountIndex() { return offsetof(BackTrackInfoParentheses, matchAmount) / sizeof(uintptr_t); } + static unsigned parenContextHeadIndex() { return offsetof(BackTrackInfoParentheses, parenContextHead) / sizeof(uintptr_t); } + }; + +} } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp index aa98c4a354..9f05f22852 100644 --- a/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp +++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Apple Inc. All rights reserved. 
+ * Copyright (C) 2011, 2016 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,6 +27,8 @@ #include "YarrSyntaxChecker.h" #include "YarrParser.h" +#include <wtf/Optional.h> +#include <wtf/text/WTFString.h> namespace JSC { namespace Yarr { @@ -35,25 +37,26 @@ public: void assertionBOL() {} void assertionEOL() {} void assertionWordBoundary(bool) {} - void atomPatternCharacter(UChar) {} + void atomPatternCharacter(UChar32) {} void atomBuiltInCharacterClass(BuiltInCharacterClassID, bool) {} void atomCharacterClassBegin(bool = false) {} void atomCharacterClassAtom(UChar) {} void atomCharacterClassRange(UChar, UChar) {} void atomCharacterClassBuiltIn(BuiltInCharacterClassID, bool) {} void atomCharacterClassEnd() {} - void atomParenthesesSubpatternBegin(bool = true) {} + void atomParenthesesSubpatternBegin(bool = true, std::optional<String> = std::nullopt) {} void atomParentheticalAssertionBegin(bool = false) {} void atomParenthesesEnd() {} void atomBackReference(unsigned) {} + void atomNamedBackReference(String) {} void quantifyAtom(unsigned, unsigned, bool) {} void disjunction() {} }; -const char* checkSyntax(const String& pattern) +ErrorCode checkSyntax(const String& pattern, const String& flags) { SyntaxChecker syntaxChecker; - return parse(syntaxChecker, pattern); + return parse(syntaxChecker, pattern, flags.contains('u')); } -}} // JSC::YARR +}} // JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrSyntaxChecker.h b/src/3rdparty/masm/yarr/YarrSyntaxChecker.h index 104ced3ab4..86daf38bcb 100644 --- a/src/3rdparty/masm/yarr/YarrSyntaxChecker.h +++ b/src/3rdparty/masm/yarr/YarrSyntaxChecker.h @@ -23,16 +23,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef YarrSyntaxChecker_h -#define YarrSyntaxChecker_h +#pragma once +#include "YarrErrorCode.h" #include <wtf/text/WTFString.h> namespace JSC { namespace Yarr { -const char* checkSyntax(const String& pattern); - -}} // JSC::YARR - -#endif // YarrSyntaxChecker_h +ErrorCode checkSyntax(const String& pattern, const String& flags); +}} // JSC::Yarr diff --git a/src/3rdparty/masm/yarr/YarrUnicodeProperties.h b/src/3rdparty/masm/yarr/YarrUnicodeProperties.h new file mode 100644 index 0000000000..20f6739de3 --- /dev/null +++ b/src/3rdparty/masm/yarr/YarrUnicodeProperties.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2017 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +#include "Yarr.h" +#include <wtf/Optional.h> +#include <wtf/text/WTFString.h> + +namespace JSC { namespace Yarr { + +struct CharacterClass; + +JS_EXPORT_PRIVATE std::optional<BuiltInCharacterClassID> unicodeMatchPropertyValue(WTF::String, WTF::String); +JS_EXPORT_PRIVATE std::optional<BuiltInCharacterClassID> unicodeMatchProperty(WTF::String); + +std::unique_ptr<CharacterClass> createUnicodeCharacterClassFor(BuiltInCharacterClassID); + +} } // namespace JSC::Yarr diff --git a/src/3rdparty/masm/create_regex_tables b/src/3rdparty/masm/yarr/create_regex_tables index 7544b75cd9..4c3dbbe3fb 100644 --- a/src/3rdparty/masm/create_regex_tables +++ b/src/3rdparty/masm/yarr/create_regex_tables @@ -1,4 +1,6 @@ -# Copyright (C) 2010, 2013 Apple Inc. All rights reserved. +#!/usr/bin/env python + +# Copyright (C) 2010, 2013-2017 Apple Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -21,16 +23,19 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os import sys types = { "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]}, - "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]}, + "wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]}, + "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]}, + "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordUnicodeIgnoreCaseChar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x017e), (0x0180, 0x2129), (0x212b, 0x10ffff)]}, "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]}, "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]}, - "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]}, + "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0x10ffff)]}, "digits": { "UseTable" : False, "data": [('0', '9')]}, - "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] } + "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0x10ffff)] } } entriesPerLine = 50 arrays = ""; 
@@ -86,15 +91,16 @@ for name, classes in types.items(): # Generate createFunction: function = ""; - function += ("CharacterClass* %sCreate()\n" % name) + function += ("std::unique_ptr<CharacterClass> %sCreate()\n" % name) function += ("{\n") if emitTables and classes["UseTable"]: if "Inverse" in classes: - function += (" CharacterClass* characterClass = new CharacterClass(_%sData, true);\n" % (classes["Inverse"])) + function += (" auto characterClass = std::make_unique<CharacterClass>(_%sData, true);\n" % (classes["Inverse"])) else: - function += (" CharacterClass* characterClass = new CharacterClass(_%sData, false);\n" % (name)) + function += (" auto characterClass = std::make_unique<CharacterClass>(_%sData, false);\n" % (name)) else: - function += (" CharacterClass* characterClass = new CharacterClass;\n") + function += (" auto characterClass = std::make_unique<CharacterClass>();\n") + hasNonBMPCharacters = False for (min, max) in ranges: if (min == max): if (min > 127): @@ -106,12 +112,19 @@ for name, classes in types.items(): function += (" characterClass->m_rangesUnicode.append(CharacterRange(0x%04x, 0x%04x));\n" % (min, max)) else: function += (" characterClass->m_ranges.append(CharacterRange(0x%02x, 0x%02x));\n" % (min, max)) + if max >= 0x10000: + hasNonBMPCharacters = True + function += (" characterClass->m_hasNonBMPCharacters = %s;\n" % ("true" if hasNonBMPCharacters else "false")) function += (" return characterClass;\n") function += ("}\n\n") functions += function if (len(sys.argv) > 1): - f = open(sys.argv[-1], "w") + path = sys.argv[-1] + dirname = os.path.dirname(path) + if not os.path.isdir(dirname): + os.makedirs(dirname) + f = open(path, "w") f.write(arrays) f.write(functions) f.close() diff --git a/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode b/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode new file mode 100644 index 0000000000..a103bcdf16 --- /dev/null +++ b/src/3rdparty/masm/yarr/generateYarrCanonicalizeUnicode @@ -0,0 +1,204 
@@ +#! /usr/bin/env python + +# Copyright (C) 2016 Apple Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This tool processes the Unicode Character Database file CaseFolding.txt to create +# canonicalization table as decribed in ECMAScript 6 standard in section +# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2. + +import optparse +import os +import re +import sys +from sets import Set + +header = """/* +* Copyright (C) 2016 Apple Inc. All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. 
Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* +* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY +* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// DO NO EDIT! 
- This file was generated by generateYarrCanonicalizeUnicode + +#include "config.h" +#include "YarrCanonicalize.h" + +namespace JSC { namespace Yarr { + +""" + +footer = """} } // JSC::Yarr +""" + +MaxUnicode = 0x10ffff +commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE) + +def openOrExit(path, mode): + try: + dirname = os.path.dirname(path) + if not os.path.isdir(dirname): + os.makedirs(dirname) + return open(path, mode) + except IOError as e: + print "I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror) + exit(1) + +class Canonicalize: + def __init__(self): + self.canonicalGroups = {}; + + def addMapping(self, code, mapping): + if mapping not in self.canonicalGroups: + self.canonicalGroups[mapping] = [] + self.canonicalGroups[mapping].append(code) + + def readCaseFolding(self, file): + codesSeen = Set() + for line in file: + line = line.split('#', 1)[0] + line = line.rstrip() + if (not len(line)): + continue + + fields = commonAndSimpleLinesRE.match(line) + if (not fields): + continue + + code = int(fields.group('code'), 16) + mapping = int(fields.group('mapping'), 16) + + codesSeen.add(code) + self.addMapping(code, mapping) + + for i in range(MaxUnicode + 1): + if i in codesSeen: + continue; + + self.addMapping(i, i) + + def createTables(self, file): + typeInfo = [""] * (MaxUnicode + 1) + characterSets = [] + + for mapping in sorted(self.canonicalGroups.keys()): + characters = self.canonicalGroups[mapping] + if len(characters) == 1: + typeInfo[characters[0]] = "CanonicalizeUnique:0" + else: + characters.sort() + if len(characters) > 2: + for ch in characters: + typeInfo[ch] = "CanonicalizeSet:%d" % len(characterSets) + characterSets.append(characters) + else: + low = characters[0] + high = characters[1] + delta = high - low + if delta == 1: + type = "CanonicalizeAlternatingUnaligned:0" if low & 1 else "CanonicalizeAlternatingAligned:0" + typeInfo[low] = type + typeInfo[high] = type + 
else: + typeInfo[low] = "CanonicalizeRangeLo:%d" % delta + typeInfo[high] = "CanonicalizeRangeHi:%d" % delta + + rangeInfo = [] + end = 0 + while end <= MaxUnicode: + begin = end + type = typeInfo[end] + while end < MaxUnicode and typeInfo[end + 1] == type: + end = end + 1 + rangeInfo.append({"begin": begin, "end": end, "type": type}) + end = end + 1 + + for i in range(len(characterSets)): + characters = "" + set = characterSets[i] + for ch in set: + characters = characters + "0x{character:04x}, ".format(character=ch) + file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=i, characters=characters)) + + file.write("\n") + file.write("static const size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets))) + file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n") + + for i in range(len(characterSets)): + file.write(" unicodeCharacterSet{setNumber:d},\n".format(setNumber=i)) + + file.write("};\n") + file.write("\n") + file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(rangeInfo))) + file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n") + + for info in rangeInfo: + typeAndValue = info["type"].split(":") + file.write(" {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=info["begin"], end=info["end"], value=int(typeAndValue[1]), type=typeAndValue[0])) + + file.write("};\n") + file.write("\n") + + +if __name__ == "__main__": + parser = optparse.OptionParser(usage = "usage: %prog <CaseFolding.txt> <YarrCanonicalizeUnicode.h>") + (options, args) = parser.parse_args() + + if len(args) != 2: + parser.error("<CaseFolding.txt> <YarrCanonicalizeUnicode.h>") + + caseFoldingTxtPath = args[0] + canonicalizeHPath = args[1] + caseFoldingTxtFile = openOrExit(caseFoldingTxtPath, "r") + canonicalizeHFile = openOrExit(canonicalizeHPath, "wb") + + canonicalize = 
Canonicalize() + canonicalize.readCaseFolding(caseFoldingTxtFile) + + canonicalizeHFile.write(header); + canonicalize.createTables(canonicalizeHFile) + canonicalizeHFile.write(footer); + + caseFoldingTxtFile.close() + canonicalizeHFile.close() + + exit(0) diff --git a/src/3rdparty/masm/yarr/yarr.pri b/src/3rdparty/masm/yarr/yarr.pri index 7e9b4d3f3b..c8e30990be 100644 --- a/src/3rdparty/masm/yarr/yarr.pri +++ b/src/3rdparty/masm/yarr/yarr.pri @@ -8,5 +8,8 @@ SOURCES += \ $$PWD/YarrInterpreter.cpp \ $$PWD/YarrPattern.cpp \ $$PWD/YarrSyntaxChecker.cpp \ - $$PWD/YarrCanonicalizeUCS2.cpp + $$PWD/YarrCanonicalizeUCS2.cpp \ + $$PWD/YarrCanonicalizeUnicode.cpp \ + $$PWD/YarrErrorCode.cpp \ + $$PWD/YarrUnicodeProperties.cpp diff --git a/src/qml/jsruntime/qv4enginebase_p.h b/src/qml/jsruntime/qv4enginebase_p.h index 3e89e57abb..189208e731 100644 --- a/src/qml/jsruntime/qv4enginebase_p.h +++ b/src/qml/jsruntime/qv4enginebase_p.h @@ -72,9 +72,8 @@ struct Q_QML_EXPORT EngineBase { quint8 hasException = false; quint8 writeBarrierActive = false; quint16 unused = 0; -#if QT_POINTER_SIZE == 8 - quint8 padding[4]; -#endif + quint8 isExecutingInRegExpJIT = false; + quint8 padding[3]; MemoryManager *memoryManager = nullptr; Runtime runtime; @@ -133,7 +132,7 @@ Q_STATIC_ASSERT(std::is_standard_layout<EngineBase>::value); Q_STATIC_ASSERT(offsetof(EngineBase, currentStackFrame) == 0); Q_STATIC_ASSERT(offsetof(EngineBase, jsStackTop) == offsetof(EngineBase, currentStackFrame) + QT_POINTER_SIZE); Q_STATIC_ASSERT(offsetof(EngineBase, hasException) == offsetof(EngineBase, jsStackTop) + QT_POINTER_SIZE); -Q_STATIC_ASSERT(offsetof(EngineBase, memoryManager) == offsetof(EngineBase, hasException) + QT_POINTER_SIZE); +Q_STATIC_ASSERT(offsetof(EngineBase, memoryManager) == offsetof(EngineBase, hasException) + 8); Q_STATIC_ASSERT(offsetof(EngineBase, runtime) == offsetof(EngineBase, memoryManager) + QT_POINTER_SIZE); } diff --git a/src/qml/jsruntime/qv4regexp.cpp b/src/qml/jsruntime/qv4regexp.cpp 
index e10493b879..89fd9fc233 100644 --- a/src/qml/jsruntime/qv4regexp.cpp +++ b/src/qml/jsruntime/qv4regexp.cpp @@ -41,6 +41,7 @@ #include "qv4engine_p.h" #include "qv4scopedvalue_p.h" #include <private/qv4mm_p.h> +#include <runtime/VM.h> using namespace QV4; @@ -100,16 +101,24 @@ void Heap::RegExp::init(ExecutionEngine *engine, const QString &pattern, bool ig valid = false; - const char* error = nullptr; - JSC::Yarr::YarrPattern yarrPattern(WTF::String(pattern), ignoreCase, multiLine, &error); - if (error) + JSC::Yarr::ErrorCode error = JSC::Yarr::ErrorCode::NoError; + JSC::RegExpFlags flags = JSC::NoFlags; + if (ignoreCase) + flags = static_cast<JSC::RegExpFlags>(flags | JSC::FlagIgnoreCase); + if (multiline) + flags = static_cast<JSC::RegExpFlags>(flags | JSC::FlagMultiline); + if (global) + flags = static_cast<JSC::RegExpFlags>(flags | JSC::FlagGlobal); + + JSC::Yarr::YarrPattern yarrPattern(WTF::String(pattern), flags, error); + if (error != JSC::Yarr::ErrorCode::NoError) return; subPatternCount = yarrPattern.m_numSubpatterns; #if ENABLE(YARR_JIT) if (!yarrPattern.m_containsBackreferences && engine->canJIT()) { jitCode = new JSC::Yarr::YarrCodeBlock; - JSC::JSGlobalData dummy(internalClass->engine->regExpAllocator); - JSC::Yarr::jitCompile(yarrPattern, JSC::Yarr::Char16, &dummy, *jitCode); + JSC::VM *vm = static_cast<JSC::VM *>(engine); + JSC::Yarr::jitCompile(yarrPattern, JSC::Yarr::Char16, vm, *jitCode); } #else Q_UNUSED(engine) @@ -118,8 +127,7 @@ void Heap::RegExp::init(ExecutionEngine *engine, const QString &pattern, bool ig valid = true; return; } - OwnPtr<JSC::Yarr::BytecodePattern> p = JSC::Yarr::byteCompile(yarrPattern, internalClass->engine->bumperPointerAllocator); - byteCode = p.take(); + byteCode = JSC::Yarr::byteCompile(yarrPattern, internalClass->engine->bumperPointerAllocator).release(); if (byteCode) valid = true; } diff --git a/src/qml/jsruntime/qv4regexp_p.h b/src/qml/jsruntime/qv4regexp_p.h index 56454f73d3..597e42538a 100644 --- 
a/src/qml/jsruntime/qv4regexp_p.h +++ b/src/qml/jsruntime/qv4regexp_p.h @@ -86,7 +86,7 @@ struct RegExp : Base { #endif bool hasValidJITCode() const { #if ENABLE(YARR_JIT) - return jitCode && !jitCode->isFallBack() && jitCode->has16BitCode(); + return jitCode && !jitCode->failureReason().has_value() && jitCode->has16BitCode(); #else return false; #endif |