aboutsummaryrefslogtreecommitdiffstats
path: root/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h')
-rw-r--r--src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h138
1 files changed, 138 insertions, 0 deletions
diff --git a/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h
new file mode 100644
index 0000000000..9dce78200c
--- /dev/null
+++ b/src/3rdparty/masm/yarr/YarrCanonicalizeUCS2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef YarrCanonicalizeUCS2_H
+#define YarrCanonicalizeUCS2_H
+
+#include <stdint.h>
+#include <wtf/unicode/Unicode.h>
+
+namespace JSC { namespace Yarr {
+
+// This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
+// provides information for each UCS2 code point as to the set of code points that it should
+// match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8.
+enum UCS2CanonicalizationType {
+ CanonicalizeUnique, // No canonically equal values, e.g. 0x0.
+ CanonicalizeSet, // Value indicates a set in characterSetInfo.
+ CanonicalizeRangeLo, // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61.
+ CanonicalizeRangeHi, // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41.
+ CanonicalizeAlternatingAligned, // Aligned consequtive pair, e.g. 0x1f4,0x1f5.
+ CanonicalizeAlternatingUnaligned, // Unaligned consequtive pair, e.g. 0x241,0x242.
+};
+struct UCS2CanonicalizationRange { uint16_t begin, end, value, type; };
+extern const size_t UCS2_CANONICALIZATION_RANGES;
+extern uint16_t* characterSetInfo[];
+extern UCS2CanonicalizationRange rangeInfo[];
+
+// This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to
+// the set of Latin1 codepoints that could match.
+enum LatinCanonicalizationType {
+ CanonicalizeLatinSelf, // This character is in the Latin1 range, but has no canonical equivalent in the range.
+ CanonicalizeLatinMask0x20, // One of a pair of characters, under the mask 0x20.
+ CanonicalizeLatinOther, // This character is not in the Latin1 range, but canonicalizes to another that is.
+ CanonicalizeLatinInvalid, // Cannot match against Latin1 input.
+};
+struct LatinCanonicalizationRange { uint16_t begin, end, value, type; };
+extern const size_t LATIN_CANONICALIZATION_RANGES;
+extern LatinCanonicalizationRange latinRangeInfo[];
+
+// This searches in log2 time over ~364 entries, so should typically result in 8 compares.
+inline UCS2CanonicalizationRange* rangeInfoFor(UChar ch)
+{
+ UCS2CanonicalizationRange* info = rangeInfo;
+ size_t entries = UCS2_CANONICALIZATION_RANGES;
+
+ while (true) {
+ size_t candidate = entries >> 1;
+ UCS2CanonicalizationRange* candidateInfo = info + candidate;
+ if (ch < candidateInfo->begin)
+ entries = candidate;
+ else if (ch <= candidateInfo->end)
+ return candidateInfo;
+ else {
+ info = candidateInfo + 1;
+ entries -= (candidate + 1);
+ }
+ }
+}
+
+// Should only be called for characters that have one canonically matching value.
+inline UChar getCanonicalPair(UCS2CanonicalizationRange* info, UChar ch)
+{
+ ASSERT(ch >= info->begin && ch <= info->end);
+ switch (info->type) {
+ case CanonicalizeRangeLo:
+ return ch + info->value;
+ case CanonicalizeRangeHi:
+ return ch - info->value;
+ case CanonicalizeAlternatingAligned:
+ return ch ^ 1;
+ case CanonicalizeAlternatingUnaligned:
+ return ((ch - 1) ^ 1) + 1;
+ default:
+ RELEASE_ASSERT_NOT_REACHED();
+ }
+ RELEASE_ASSERT_NOT_REACHED();
+ return 0;
+}
+
+// Returns true if no other UCS2 codepoint can match this value.
+inline bool isCanonicallyUnique(UChar ch)
+{
+ return rangeInfoFor(ch)->type == CanonicalizeUnique;
+}
+
+// Returns true if values are equal, under the canonicalization rules.
+inline bool areCanonicallyEquivalent(UChar a, UChar b)
+{
+ UCS2CanonicalizationRange* info = rangeInfoFor(a);
+ switch (info->type) {
+ case CanonicalizeUnique:
+ return a == b;
+ case CanonicalizeSet: {
+ for (uint16_t* set = characterSetInfo[info->value]; (a = *set); ++set) {
+ if (a == b)
+ return true;
+ }
+ return false;
+ }
+ case CanonicalizeRangeLo:
+ return (a == b) || (a + info->value == b);
+ case CanonicalizeRangeHi:
+ return (a == b) || (a - info->value == b);
+ case CanonicalizeAlternatingAligned:
+ return (a | 1) == (b | 1);
+ case CanonicalizeAlternatingUnaligned:
+ return ((a - 1) | 1) == ((b - 1) | 1);
+ }
+
+ RELEASE_ASSERT_NOT_REACHED();
+ return false;
+}
+
+} } // JSC::Yarr
+
+#endif