1 files changed, 1912 insertions, 0 deletions
diff --git a/src/shared/quickjs/libunicode.c b/src/shared/quickjs/libunicode.c
new file mode 100644
index 000000000..482c51944
--- /dev/null
+++ b/src/shared/quickjs/libunicode.c
@@ -0,0 +1,1912 @@
+/*
+ * Unicode utilities
+ *
+ * Copyright (c) 2017-2018 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <assert.h>
+
+#include "cutils.h"
+#include "libunicode.h"
+#include "libunicode-table.h"
+
+enum {
+    RUN_TYPE_U,
+    RUN_TYPE_L,
+    RUN_TYPE_UF,
+    RUN_TYPE_LF,
+    RUN_TYPE_UL,
+    RUN_TYPE_LSU,
+    RUN_TYPE_U2L_399_EXT2,
+    RUN_TYPE_UF_D20,
+    RUN_TYPE_UF_D1_EXT,
+    RUN_TYPE_U_EXT,
+    RUN_TYPE_LF_EXT,
+    RUN_TYPE_UF_EXT2,
+    RUN_TYPE_LF_EXT2,
+    RUN_TYPE_UF_EXT3,
+};
+
+static int lre_case_conv1(uint32_t c, int conv_type)
+{
+    uint32_t res[LRE_CC_RES_LEN_MAX];
+    lre_case_conv(res, c, conv_type);
+    return res[0];
+}
+
+/* case conversion using the table entry 'idx' with value 'v' */
+static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
+{
+    uint32_t code, data, type, a, is_lower;
+    is_lower = (conv_type != 0);
+    type = (v >> (32 - 17 - 7 - 4)) & 0xf;
+    data = ((v & 0xf) << 8) | case_conv_table2[idx];
+    code = v >> (32 - 17);
+    switch(type) {
+    case RUN_TYPE_U:
+    case RUN_TYPE_L:
+    case RUN_TYPE_UF:
+    case RUN_TYPE_LF:
+        if (conv_type == (type & 1) ||
+            (type >= RUN_TYPE_UF && conv_type == 2)) {
+            c = c - code + (case_conv_table1[data] >> (32 - 17));
+        }
+        break;
+    case RUN_TYPE_UL:
+        a = c - code;
+        if ((a & 1) != (1 - is_lower))
+            break;
+        c = (a ^ 1) + code;
+        break;
+    case RUN_TYPE_LSU:
+        a = c - code;
+        if (a == 1) {
+            c += 2 * is_lower - 1;
+        } else if (a == (1 - is_lower) * 2) {
+            c += (2 * is_lower - 1) * 2;
+        }
+        break;
+    case RUN_TYPE_U2L_399_EXT2:
+        if (!is_lower) {
+            res[0] = c - code + case_conv_ext[data >> 6];
+            res[1] = 0x399;
+            return 2;
+        } else {
+            c = c - code + case_conv_ext[data & 0x3f];
+        }
+        break;
+    case RUN_TYPE_UF_D20:
+        if (conv_type == 1)
+            break;
+        c = data + (conv_type == 2) * 0x20;
+        break;
+    case RUN_TYPE_UF_D1_EXT:
+        if (conv_type == 1)
+            break;
+        c = case_conv_ext[data] + (conv_type == 2);
+        break;
+    case RUN_TYPE_U_EXT:
+    case RUN_TYPE_LF_EXT:
+        if (is_lower != (type - RUN_TYPE_U_EXT))
+            break;
+        c = case_conv_ext[data];
+        break;
+    case RUN_TYPE_LF_EXT2:
+        if (!is_lower)
+            break;
+        res[0] = c - code + case_conv_ext[data >> 6];
+        res[1] = case_conv_ext[data & 0x3f];
+        return 2;
+    case RUN_TYPE_UF_EXT2:
+        if (conv_type == 1)
+            break;
+        res[0] = c - code + case_conv_ext[data >> 6];
+        res[1] = case_conv_ext[data & 0x3f];
+        if (conv_type == 2) {
+            /* convert to lower */
+            res[0] = lre_case_conv1(res[0], 1);
+            res[1] = lre_case_conv1(res[1], 1);
+        }
+        return 2;
+    default:
+    case RUN_TYPE_UF_EXT3:
+        if (conv_type == 1)
+            break;
+        res[0] = case_conv_ext[data >> 8];
+        res[1] = case_conv_ext[(data >> 4) & 0xf];
+        res[2] = case_conv_ext[data & 0xf];
+        if (conv_type == 2) {
+            /* convert to lower */
+            res[0] = lre_case_conv1(res[0], 1);
+            res[1] = lre_case_conv1(res[1], 1);
+            res[2] = lre_case_conv1(res[2], 1);
+        }
+        return 3;
+    }
+    res[0] = c;
+    return 1;
+}
+
+/* conv_type:
+   0 = to upper
+   1 = to lower
+   2 = case folding (= to lower with modifications)
+*/
+int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
+{
+    if (c < 128) {
+        if (conv_type) {
+            if (c >= 'A' && c <= 'Z') {
+                c = c - 'A' + 'a';
+            }
+        } else {
+            if (c >= 'a' && c <= 'z') {
+                c = c - 'a' + 'A';
+            }
+        }
+    } else {
+        uint32_t v, code, len;
+        int idx, idx_min, idx_max;
+
+        idx_min = 0;
+        idx_max = countof(case_conv_table1) - 1;
+        while (idx_min <= idx_max) {
+            idx = (unsigned)(idx_max + idx_min) / 2;
+            v = case_conv_table1[idx];
+            code = v >> (32 - 17);
+            len = (v >> (32 - 17 - 7)) & 0x7f;
+            if (c < code) {
+                idx_max = idx - 1;
+            } else if (c >= code + len) {
+                idx_min = idx + 1;
+            } else {
+                return lre_case_conv_entry(res, c, conv_type, idx, v);
+            }
+        }
+    }
+    res[0] = c;
+    return 1;
+}
+
+static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
+{
+    uint32_t res[LRE_CC_RES_LEN_MAX];
+    int len;
+
+    if (is_unicode) {
+        len = lre_case_conv_entry(res, c, 2, idx, v);
+        if (len == 1) {
+            c = res[0];
+        } else {
+            /* handle the few specific multi-character cases (see
+               unicode_gen.c:dump_case_folding_special_cases()) */
+            if (c == 0xfb06) {
+                c = 0xfb05;
+            } else if (c == 0x01fd3) {
+                c = 0x390;
+            } else if (c == 0x01fe3) {
+                c = 0x3b0;
+            }
+        }
+    } else {
+        if (likely(c < 128)) {
+            if (c >= 'a' && c <= 'z')
+                c = c - 'a' + 'A';
+        } else {
+            /* legacy regexp: to upper case if single char >= 128 */
+            len = lre_case_conv_entry(res, c, FALSE, idx, v);
+            if (len == 1 && res[0] >= 128)
+                c = res[0];
+        }
+    }
+    return c;
+}
+
+/* JS regexp specific rules for case folding */
+int lre_canonicalize(uint32_t c, BOOL is_unicode)
+{
+    if (c < 128) {
+        /* fast case */
+        if (is_unicode) {
+            if (c >= 'A' && c <= 'Z') {
+                c = c - 'A' + 'a';
+            }
+        } else {
+            if (c >= 'a' && c <= 'z') {
+                c = c - 'a' + 'A';
+            }
+        }
+    } else {
+        uint32_t v, code, len;
+        int idx, idx_min, idx_max;
+
+        idx_min = 0;
+        idx_max = countof(case_conv_table1) - 1;
+        while (idx_min <= idx_max) {
+            idx = (unsigned)(idx_max + idx_min) / 2;
+            v = case_conv_table1[idx];
+            code = v >> (32 - 17);
+            len = (v >> (32 - 17 - 7)) & 0x7f;
+            if (c < code) {
+                idx_max = idx - 1;
+            } else if (c >= code + len) {
+                idx_min = idx + 1;
+            } else {
+                return lre_case_folding_entry(c, idx, v, is_unicode);
+            }
+        }
+    }
+    return c;
+}
+
+static uint32_t get_le24(const uint8_t *ptr)
+{
+    return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
+}
+
+#define UNICODE_INDEX_BLOCK_LEN 32
+
+/* return -1 if not in table, otherwise the offset in the block */
+static int get_index_pos(uint32_t *pcode, uint32_t c,
+                         const uint8_t *index_table, int index_table_len)
+{
+    uint32_t code, v;
+    int idx_min, idx_max, idx;
+
+    idx_min = 0;
+    v = get_le24(index_table);
+    code = v & ((1 << 21) - 1);
+    if (c < code) {
+        *pcode = 0;
+        return 0;
+    }
+    idx_max = index_table_len - 1;
+    code = get_le24(index_table + idx_max * 3);
+    if (c >= code)
+        return -1;
+    /* invariant: tab[idx_min] <= c < tab2[idx_max] */
+    while ((idx_max - idx_min) > 1) {
+        idx = (idx_max + idx_min) / 2;
+        v = get_le24(index_table + idx * 3);
+        code = v & ((1 << 21) - 1);
+        if (c < code) {
+            idx_max = idx;
+        } else {
+            idx_min = idx;
+        }
+    }
+    v = get_le24(index_table + idx_min * 3);
+    *pcode = v & ((1 << 21) - 1);
+    return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
+}
+
+static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
+                            const uint8_t *index_table, int index_table_len)
+{
+    uint32_t code, b, bit;
+    int pos;
+    const uint8_t *p;
+
+    pos = get_index_pos(&code, c, index_table, index_table_len);
+    if (pos < 0)
+        return FALSE; /* outside the table */
+    p = table + pos;
+    bit = 0;
+    /* Compressed run length encoding:
+       00..3F: 2 packed lengths: 3-bit + 3-bit
+       40..5F: 5-bits plus extra byte for length
+       60..7F: 5-bits plus 2 extra bytes for length
+       80..FF: 7-bit length
+       lengths must be incremented to get character count
+       Ranges alternate between false and true return value.
+     */
+    for(;;) {
+        b = *p++;
+        if (b < 64) {
+            code += (b >> 3) + 1;
+            if (c < code)
+                return bit;
+            bit ^= 1;
+            code += (b & 7) + 1;
+        } else if (b >= 0x80) {
+            code += b - 0x80 + 1;
+        } else if (b < 0x60) {
+            code += (((b - 0x40) << 8) | p[0]) + 1;
+            p++;
+        } else {
+            code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
+            p += 2;
+        }
+        if (c < code)
+            return bit;
+        bit ^= 1;
+    }
+}
+
+BOOL lre_is_cased(uint32_t c)
+{
+    uint32_t v, code, len;
+    int idx, idx_min, idx_max;
+
+    idx_min = 0;
+    idx_max = countof(case_conv_table1) - 1;
+    while (idx_min <= idx_max) {
+        idx = (unsigned)(idx_max + idx_min) / 2;
+        v = case_conv_table1[idx];
+        code = v >> (32 - 17);
+        len = (v >> (32 - 17 - 7)) & 0x7f;
+        if (c < code) {
+            idx_max = idx - 1;
+        } else if (c >= code + len) {
+            idx_min = idx + 1;
+        } else {
+            return TRUE;
+        }
+    }
+    return lre_is_in_table(c, unicode_prop_Cased1_table,
+                           unicode_prop_Cased1_index,
+                           sizeof(unicode_prop_Cased1_index) / 3);
+}
+
+BOOL lre_is_case_ignorable(uint32_t c)
+{
+    return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
+                           unicode_prop_Case_Ignorable_index,
+                           sizeof(unicode_prop_Case_Ignorable_index) / 3);
+}
+
+/* character range */
+
+static __maybe_unused void cr_dump(CharRange *cr)
+{
+    int i;
+    for(i = 0; i < cr->len; i++)
+        printf("%d: 0x%04x\n", i, cr->points[i]);
+}
+
+static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
+{
+    return realloc(ptr, size);
+}
+
+void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
+{
+    cr->len = cr->size = 0;
+    cr->points = NULL;
+    cr->mem_opaque = mem_opaque;
+    cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
+}
+
+void cr_free(CharRange *cr)
+{
+    cr->realloc_func(cr->mem_opaque, cr->points, 0);
+}
+
+int cr_realloc(CharRange *cr, int size)
+{
+    int new_size;
+    uint32_t *new_buf;
+
+    if (size > cr->size) {
+        new_size = max_int(size, cr->size * 3 / 2);
+        new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
+                                   new_size * sizeof(cr->points[0]));
+        if (!new_buf)
+            return -1;
+        cr->points = new_buf;
+        cr->size = new_size;
+    }
+    return 0;
+}
+
+int cr_copy(CharRange *cr, const CharRange *cr1)
+{
+    if (cr_realloc(cr, cr1->len))
+        return -1;
+    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
+    cr->len = cr1->len;
+    return 0;
+}
+
+/* merge consecutive intervals and remove empty intervals */
+static void cr_compress(CharRange *cr)
+{
+    int i, j, k, len;
+    uint32_t *pt;
+
+    pt = cr->points;
+    len = cr->len;
+    i = 0;
+    j = 0;
+    k = 0;
+    while ((i + 1) < len) {
+        if (pt[i] == pt[i + 1]) {
+            /* empty interval */
+            i += 2;
+        } else {
+            j = i;
+            while ((j + 3) < len && pt[j + 1] == pt[j + 2])
+                j += 2;
+            /* just copy */
+            pt[k] = pt[i];
+            pt[k + 1] = pt[j + 1];
+            k += 2;
+            i = j + 2;
+        }
+    }
+    cr->len = k;
+}
+
+/* union or intersection */
+int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
+          const uint32_t *b_pt, int b_len, int op)
+{
+    int a_idx, b_idx, is_in;
+    uint32_t v;
+
+    a_idx = 0;
+    b_idx = 0;
+    for(;;) {
+        /* get one more point from a or b in increasing order */
+        if (a_idx < a_len && b_idx < b_len) {
+            if (a_pt[a_idx] < b_pt[b_idx]) {
+                goto a_add;
+            } else if (a_pt[a_idx] == b_pt[b_idx]) {
+                v = a_pt[a_idx];
+                a_idx++;
+                b_idx++;
+            } else {
+                goto b_add;
+            }
+        } else if (a_idx < a_len) {
+        a_add:
+            v = a_pt[a_idx++];
+        } else if (b_idx < b_len) {
+        b_add:
+            v = b_pt[b_idx++];
+        } else {
+            break;
+        }
+        /* add the point if the in/out status changes */
+        switch(op) {
+        case CR_OP_UNION:
+            is_in = (a_idx & 1) | (b_idx & 1);
+            break;
+        case CR_OP_INTER:
+            is_in = (a_idx & 1) & (b_idx & 1);
+            break;
+        case CR_OP_XOR:
+            is_in = (a_idx & 1) ^ (b_idx & 1);
+            break;
+        default:
+            abort();
+        }
+        if (is_in != (cr->len & 1)) {
+            if (cr_add_point(cr, v))
+                return -1;
+        }
+    }
+    cr_compress(cr);
+    return 0;
+}
+
+int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
+{
+    CharRange a = *cr;
+    int ret;
+    cr->len = 0;
+    cr->size = 0;
+    cr->points = NULL;
+    ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
+    cr_free(&a);
+    return ret;
+}
+
+int cr_invert(CharRange *cr)
+{
+    int len;
+    len = cr->len;
+    if (cr_realloc(cr, len + 2))
+        return -1;
+    memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
+    cr->points[0] = 0;
+    cr->points[len + 1] = UINT32_MAX;
+    cr->len = len + 2;
+    cr_compress(cr);
+    return 0;
+}
+
+#ifdef CONFIG_ALL_UNICODE
+
+BOOL lre_is_id_start(uint32_t c)
+{
+    return lre_is_in_table(c, unicode_prop_ID_Start_table,
+                           unicode_prop_ID_Start_index,
+                           sizeof(unicode_prop_ID_Start_index) / 3);
+}
+
+BOOL lre_is_id_continue(uint32_t c)
+{
+    return lre_is_id_start(c) ||
+        lre_is_in_table(c, unicode_prop_ID_Continue1_table,
+                        unicode_prop_ID_Continue1_index,
+                        sizeof(unicode_prop_ID_Continue1_index) / 3);
+}
+
+#define UNICODE_DECOMP_LEN_MAX 18
+
+typedef enum {
+    DECOMP_TYPE_C1, /* 16 bit char */
+    DECOMP_TYPE_L1, /* 16 bit char table */
+    DECOMP_TYPE_L2,
+    DECOMP_TYPE_L3,
+    DECOMP_TYPE_L4,
+    DECOMP_TYPE_L5, /* XXX: not used */
+    DECOMP_TYPE_L6, /* XXX: could remove */
+    DECOMP_TYPE_L7, /* XXX: could remove */
+    DECOMP_TYPE_LL1, /* 18 bit char table */
+    DECOMP_TYPE_LL2,
+    DECOMP_TYPE_S1, /* 8 bit char table */
+    DECOMP_TYPE_S2,
+    DECOMP_TYPE_S3,
+    DECOMP_TYPE_S4,
+    DECOMP_TYPE_S5,
+    DECOMP_TYPE_I1, /* increment 16 bit char value */
+    DECOMP_TYPE_I2_0,
+    DECOMP_TYPE_I2_1,
+    DECOMP_TYPE_I3_1,
+    DECOMP_TYPE_I3_2,
+    DECOMP_TYPE_I4_1,
+    DECOMP_TYPE_I4_2,
+    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
+    DECOMP_TYPE_B2,
+    DECOMP_TYPE_B3,
+    DECOMP_TYPE_B4,
+    DECOMP_TYPE_B5,
+    DECOMP_TYPE_B6,
+    DECOMP_TYPE_B7,
+    DECOMP_TYPE_B8,
+    DECOMP_TYPE_B18,
+    DECOMP_TYPE_LS2,
+    DECOMP_TYPE_PAT3,
+    DECOMP_TYPE_S2_UL,
+    DECOMP_TYPE_LS2_UL,
+} DecompTypeEnum;
+
+static uint32_t unicode_get_short_code(uint32_t c)
+{
+    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
+
+    if (c < 0x80)
+        return c;
+    else if (c < 0x80 + 0x50)
+        return c - 0x80 + 0x300;
+    else
+        return unicode_short_table[c - 0x80 - 0x50];
+}
+
+static uint32_t unicode_get_lower_simple(uint32_t c)
+{
+    if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
+        c += 0x20;
+    else
+        c++;
+    return c;
+}
+
+static uint16_t unicode_get16(const uint8_t *p)
+{
+    return p[0] | (p[1] << 8);
+}
+
+static int unicode_decomp_entry(uint32_t *res, uint32_t c,
+                                int idx, uint32_t code, uint32_t len,
+                                uint32_t type)
+{
+    uint32_t c1;
+    int l, i, p;
+    const uint8_t *d;
+
+    if (type == DECOMP_TYPE_C1) {
+        res[0] = unicode_decomp_table2[idx];
+        return 1;
+    } else {
+        d = unicode_decomp_data + unicode_decomp_table2[idx];
+        switch(type) {
+        case DECOMP_TYPE_L1:
+        case DECOMP_TYPE_L2:
+        case DECOMP_TYPE_L3:
+        case DECOMP_TYPE_L4:
+        case DECOMP_TYPE_L5:
+        case DECOMP_TYPE_L6:
+        case DECOMP_TYPE_L7:
+            l = type - DECOMP_TYPE_L1 + 1;
+            d += (c - code) * l * 2;
+            for(i = 0; i < l; i++) {
+                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
+                    return 0;
+            }
+            return l;
+        case DECOMP_TYPE_LL1:
+        case DECOMP_TYPE_LL2:
+            {
+                uint32_t k, p;
+                l = type - DECOMP_TYPE_LL1 + 1;
+                k = (c - code) * l;
+                p = len * l * 2;
+                for(i = 0; i < l; i++) {
+                    c1 = unicode_get16(d + 2 * k) |
+                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
+                    if (!c1)
+                        return 0;
+                    res[i] = c1;
+                    k++;
+                }
+            }
+            return l;
+        case DECOMP_TYPE_S1:
+        case DECOMP_TYPE_S2:
+        case DECOMP_TYPE_S3:
+        case DECOMP_TYPE_S4:
+        case DECOMP_TYPE_S5:
+            l = type - DECOMP_TYPE_S1 + 1;
+            d += (c - code) * l;
+            for(i = 0; i < l; i++) {
+                if ((res[i] = unicode_get_short_code(d[i])) == 0)
+                    return 0;
+            }
+            return l;
+        case DECOMP_TYPE_I1:
+            l = 1;
+            p = 0;
+            goto decomp_type_i;
+        case DECOMP_TYPE_I2_0:
+        case DECOMP_TYPE_I2_1:
+        case DECOMP_TYPE_I3_1:
+        case DECOMP_TYPE_I3_2:
+        case DECOMP_TYPE_I4_1:
+        case DECOMP_TYPE_I4_2:
+            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
+            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
+        decomp_type_i:
+            for(i = 0; i < l; i++) {
+                c1 = unicode_get16(d + 2 * i);
+                if (i == p)
+                    c1 += c - code;
+                res[i] = c1;
+            }
+            return l;
+        case DECOMP_TYPE_B18:
+            l = 18;
+            goto decomp_type_b;
+        case DECOMP_TYPE_B1:
+        case DECOMP_TYPE_B2:
+        case DECOMP_TYPE_B3:
+        case DECOMP_TYPE_B4:
+        case DECOMP_TYPE_B5:
+        case DECOMP_TYPE_B6:
+        case DECOMP_TYPE_B7:
+        case DECOMP_TYPE_B8:
+            l = type - DECOMP_TYPE_B1 + 1;
+        decomp_type_b:
+            {
+                uint32_t c_min;
+                c_min = unicode_get16(d);
+                d += 2 + (c - code) * l;
+                for(i = 0; i < l; i++) {
+                    c1 = d[i];
+                    if (c1 == 0xff)
+                        c1 = 0x20;
+                    else
+                        c1 += c_min;
+                    res[i] = c1;
+                }
+            }
+            return l;
+        case DECOMP_TYPE_LS2:
+            d += (c - code) * 3;
+            if (!(res[0] = unicode_get16(d)))
+                return 0;
+            res[1] = unicode_get_short_code(d[2]);
+            return 2;
+        case DECOMP_TYPE_PAT3:
+            res[0] = unicode_get16(d);
+            res[2] = unicode_get16(d + 2);
+            d += 4 + (c - code) * 2;
+            res[1] = unicode_get16(d);
+            return 3;
+        case DECOMP_TYPE_S2_UL:
+        case DECOMP_TYPE_LS2_UL:
+            c1 = c - code;
+            if (type == DECOMP_TYPE_S2_UL) {
+                d += c1 & ~1;
+                c = unicode_get_short_code(*d);
+                d++;
+            } else {
+                d += (c1 >> 1) * 3;
+                c = unicode_get16(d);
+                d += 2;
+            }
+            if (c1 & 1)
+                c = unicode_get_lower_simple(c);
+            res[0] = c;
+            res[1] = unicode_get_short_code(*d);
+            return 2;
+        }
+    }
+    return 0;
+}
+
+
+/* return the length of the decomposition (length <=
+   UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
+static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
+{
+    uint32_t v, type, is_compat, code, len;
+    int idx_min, idx_max, idx;
+
+    idx_min = 0;
+    idx_max = countof(unicode_decomp_table1) - 1;
+    while (idx_min <= idx_max) {
+        idx = (idx_max + idx_min) / 2;
+        v = unicode_decomp_table1[idx];
+        code = v >> (32 - 18);
+        len = (v >> (32 - 18 - 7)) & 0x7f;
+        //        printf("idx=%d code=%05x len=%d\n", idx, code, len);
+        if (c < code) {
+            idx_max = idx - 1;
+        } else if (c >= code + len) {
+            idx_min = idx + 1;
+        } else {
+            is_compat = v & 1;
+            if (is_compat1 < is_compat)
+                break;
+            type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
+            return unicode_decomp_entry(res, c, idx, code, len, type);
+        }
+    }
+    return 0;
+}
+
+/* return 0 if no pair found */
+static int unicode_compose_pair(uint32_t c0, uint32_t c1)
+{
+    uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
+    int idx_min, idx_max, idx, d;
+    uint32_t pair[2];
+
+    idx_min = 0;
+    idx_max = countof(unicode_comp_table) - 1;
+    while (idx_min <= idx_max) {
+        idx = (idx_max + idx_min) / 2;
+        idx1 = unicode_comp_table[idx];
+
+        /* idx1 represent an entry of the decomposition table */
+        d_idx = idx1 >> 6;
+        d_offset = idx1 & 0x3f;
+        v = unicode_decomp_table1[d_idx];
+        code = v >> (32 - 18);
+        len = (v >> (32 - 18 - 7)) & 0x7f;
+        type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
+        ch = code + d_offset;
+        unicode_decomp_entry(pair, ch, d_idx, code, len, type);
+        d = c0 - pair[0];
+        if (d == 0)
+            d = c1 - pair[1];
+        if (d < 0) {
+            idx_max = idx - 1;
+        } else if (d > 0) {
+            idx_min = idx + 1;
+        } else {
+            return ch;
+        }
+    }
+    return 0;
+}
+
+/* return the combining class of character c (between 0 and 255) */
+static int unicode_get_cc(uint32_t c)
+{
+    uint32_t code, n, type, cc, c1, b;
+    int pos;
+    const uint8_t *p;
+
+    pos = get_index_pos(&code, c,
+                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
+    if (pos < 0)
+        return 0;
+    p = unicode_cc_table + pos;
+    /* Compressed run length encoding:
+       - 2 high order bits are combining class type
+       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
+       - 00..2F: range length (add 1)
+       - 30..37: 3-bit range-length + 1 extra byte
+       - 38..3F: 3-bit range-length + 2 extra byte
+     */
+    for(;;) {
+        b = *p++;
+        type = b >> 6;
+        n = b & 0x3f;
+        if (n < 48) {
+        } else if (n < 56) {
+            n = (n - 48) << 8;
+            n |= *p++;
+            n += 48;
+        } else {
+            n = (n - 56) << 8;
+            n |= *p++ << 8;
+            n |= *p++;
+            n += 48 + (1 << 11);
+        }
+        if (type <= 1)
+            p++;
+        c1 = code + n + 1;
+        if (c < c1) {
+            switch(type) {
+            case 0:
+                cc = p[-1];
+                break;
+            case 1:
+                cc = p[-1] + c - code;
+                break;
+            case 2:
+                cc = 0;
+                break;
+            default:
+            case 3:
+                cc = 230;
+                break;
+            }
+            return cc;
+        }
+        code = c1;
+    }
+}
+
+static void sort_cc(int *buf, int len)
+{
+    int i, j, k, cc, cc1, start, ch1;
+
+    for(i = 0; i < len; i++) {
+        cc = unicode_get_cc(buf[i]);
+        if (cc != 0) {
+            start = i;
+            j = i + 1;
+            while (j < len) {
+                ch1 = buf[j];
+                cc1 = unicode_get_cc(ch1);
+                if (cc1 == 0)
+                    break;
+                k = j - 1;
+                while (k >= start) {
+                    if (unicode_get_cc(buf[k]) <= cc1)
+                        break;
+                    buf[k + 1] = buf[k];
+                    k--;
+                }
+                buf[k + 1] = ch1;
+                j++;
+            }
+#if 0
+            printf("cc:");
+            for(k = start; k < j; k++) {
+                printf(" %3d", unicode_get_cc(buf[k]));
+            }
+            printf("\n");
+#endif
+            i = j;
+        }
+    }
+}
+
+static void to_nfd_rec(DynBuf *dbuf,
+                       const int *src, int src_len, int is_compat)
+{
+    uint32_t c, v;
+    int i, l;
+    uint32_t res[UNICODE_DECOMP_LEN_MAX];
+
+    for(i = 0; i < src_len; i++) {
+        c = src[i];
+        if (c >= 0xac00 && c < 0xd7a4) {
+            /* Hangul decomposition */
+            c -= 0xac00;
+            dbuf_put_u32(dbuf, 0x1100 + c / 588);
+            dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
+            v = c % 28;
+            if (v != 0)
+                dbuf_put_u32(dbuf, 0x11a7 + v);
+        } else {
+            l = unicode_decomp_char(res, c, is_compat);
+            if (l) {
+                to_nfd_rec(dbuf, (int *)res, l, is_compat);
+            } else {
+                dbuf_put_u32(dbuf, c);
+            }
+        }
+    }
+}
+
+/* return 0 if not found */
+static int compose_pair(uint32_t c0, uint32_t c1)
+{
+    /* Hangul composition */
+    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
+        c1 >= 0x1161 && c1 < 0x1161 + 21) {
+        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
+    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
+               (c0 - 0xac00) % 28 == 0 &&
+               c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
+        return c0 + c1 - 0x11a7;
+    } else {
+        return unicode_compose_pair(c0, c1);
+    }
+}
+
+int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
+                      UnicodeNormalizationEnum n_type,
+                      void *opaque, DynBufReallocFunc *realloc_func)
+{
+    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
+    BOOL is_compat;
+    DynBuf dbuf_s, *dbuf = &dbuf_s;
+
+    is_compat = n_type >> 1;
+
+    dbuf_init2(dbuf, opaque, realloc_func);
+    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
+        goto fail;
+
+    /* common case: latin1 is unaffected by NFC */
+    if (n_type == UNICODE_NFC) {
+        for(i = 0; i < src_len; i++) {
+            if (src[i] >= 0x100)
+                goto not_latin1;
+        }
+        buf = (int *)dbuf->buf;
+        memcpy(buf, src, src_len * sizeof(int));
+        *pdst = (uint32_t *)buf;
+        return src_len;
+    not_latin1: ;
+    }
+
+    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
+    if (dbuf_error(dbuf)) {
+    fail:
+        *pdst = NULL;
+        return -1;
+    }
+    buf = (int *)dbuf->buf;
+    buf_len = dbuf->size / sizeof(int);
+
+    sort_cc(buf, buf_len);
+
+    if (buf_len <= 1 || (n_type & 1) != 0) {
+        /* NFD / NFKD */
+        *pdst = (uint32_t *)buf;
+        return buf_len;
+    }
+
+    i = 1;
+    out_len = 1;
+    while (i < buf_len) {
+        /* find the starter character and test if it is blocked from
+           the character at 'i' */
+        last_cc = unicode_get_cc(buf[i]);
+        starter_pos = out_len - 1;
+        while (starter_pos >= 0) {
+            cc = unicode_get_cc(buf[starter_pos]);
+            if (cc == 0)
+                break;
+            if (cc >= last_cc)
+                goto next;
+            last_cc = 256;
+            starter_pos--;
+        }
+        if (starter_pos >= 0 &&
+            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
+            buf[starter_pos] = p;
+            i++;
+        } else {
+        next:
+            buf[out_len++] = buf[i++];
+        }
+    }
+    *pdst = (uint32_t *)buf;
+    return out_len;
+}
+
+/* char ranges for various unicode properties */
+
+static int unicode_find_name(const char *name_table, const char *name)
+{
+    const char *p, *r;
+    int pos;
+    size_t name_len, len;
+
+    p = name_table;
+    pos = 0;
+    name_len = strlen(name);
+    while (*p) {
+        for(;;) {
+            r = strchr(p, ',');
+            if (!r)
+                len = strlen(p);
+            else
+                len = r - p;
+            if (len == name_len && !memcmp(p, name, name_len))
+                return pos;
+            p += len + 1;
+            if (!r)
+                break;
+        }
+        pos++;
+    }
+    return -1;
+}
+
+/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
+   if not found */
+int unicode_script(CharRange *cr,
+                   const char *script_name, BOOL is_ext)
+{
+    int script_idx;
+    const uint8_t *p, *p_end;
+    uint32_t c, c1, b, n, v, v_len, i, type;
+    CharRange cr1_s, *cr1;
+    CharRange cr2_s, *cr2 = &cr2_s;
+    BOOL is_common;
+
+    script_idx = unicode_find_name(unicode_script_name_table, script_name);
+    if (script_idx < 0)
+        return -2;
+    /* Note: we remove the "Unknown" Script */
+    script_idx += UNICODE_SCRIPT_Unknown + 1;
+
+    is_common = (script_idx == UNICODE_SCRIPT_Common ||
+                 script_idx == UNICODE_SCRIPT_Inherited);
+    if (is_ext) {
+        cr1 = &cr1_s;
+        cr_init(cr1, cr->mem_opaque, cr->realloc_func);
+        cr_init(cr2, cr->mem_opaque, cr->realloc_func);
+    } else {
+        cr1 = cr;
+    }
+
+    p = unicode_script_table;
+    p_end = unicode_script_table + countof(unicode_script_table);
+    c = 0;
+    while (p < p_end) {
+        b = *p++;
+        type = b >> 7;
+        n = b & 0x7f;
+        if (n < 96) {
+        } else if (n < 112) {
+            n = (n - 96) << 8;
+            n |= *p++;
+            n += 96;
+        } else {
+            n = (n - 112) << 16;
+            n |= *p++ << 8;
+            n |= *p++;
+            n += 96 + (1 << 12);
+        }
+        if (type == 0)
+            v = 0;
+        else
+            v = *p++;
+        c1 = c + n + 1;
+        if (v == script_idx) {
+            if (cr_add_interval(cr1, c, c1))
+                goto fail;
+        }
+        c = c1;
+    }
+
+    if (is_ext) {
+        /* add the script extensions */
+        p = unicode_script_ext_table;
+        p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
+        c = 0;
+        while (p < p_end) {
+            b = *p++;
+            if (b < 128) {
+                n = b;
+            } else if (b < 128 + 64) {
+                n = (b - 128) << 8;
+                n |= *p++;
+                n += 128;
+            } else {
+                n = (b - 128 - 64) << 16;
+                n |= *p++ << 8;
+                n |= *p++;
+                n += 128 + (1 << 14);
+            }
+            c1 = c + n + 1;
+            v_len = *p++;
+            if (is_common) {
+                if (v_len != 0) {
+                    if (cr_add_interval(cr2, c, c1))
+                        goto fail;
+                }
+            } else {
+                for(i = 0; i < v_len; i++) {
+                    if (p[i] == script_idx) {
+                        if (cr_add_interval(cr2, c, c1))
+                            goto fail;
+                        break;
+                    }
+                }
+            }
+            p += v_len;
+            c = c1;
+        }
+        if (is_common) {
+            /* remove all the characters with script extensions */
+            if (cr_invert(cr2))
+                goto fail;
+            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
+                      CR_OP_INTER))
+                goto fail;
+        } else {
+            if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
+                      CR_OP_UNION))
+                goto fail;
+        }
+        cr_free(cr1);
+        cr_free(cr2);
+    }
+    return 0;
+ fail:
+    if (is_ext) {
+        cr_free(cr1);
+        cr_free(cr2);
+    }
+    goto fail;
+}
+
+#define M(id) (1U << UNICODE_GC_ ## id)
+
+static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
+{
+    const uint8_t *p, *p_end;
+    uint32_t c, c0, b, n, v;
+
+    p = unicode_gc_table;
+    p_end = unicode_gc_table + countof(unicode_gc_table);
+    c = 0;
+    /* Compressed range encoding:
+       initial byte:
+       bits 0..4: category number (special case 31)
+       bits 5..7: range length (add 1)
+       special case bits 5..7 == 7: read an extra byte
+       - 00..7F: range length (add 7 + 1)
+       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
+       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
+     */
+    while (p < p_end) {
+        b = *p++;
+        n = b >> 5;
+        v = b & 0x1f;
+        if (n == 7) {
+            n = *p++;
+            if (n < 128) {
+                n += 7;
+            } else if (n < 128 + 64) {
+                n = (n - 128) << 8;
+                n |= *p++;
+                n += 7 + 128;
+            } else {
+                n = (n - 128 - 64) << 16;
+                n |= *p++ << 8;
+                n |= *p++;
+                n += 7 + 128 + (1 << 14);
+            }
+        }
+        c0 = c;
+        c += n + 1;
+        if (v == 31) {
+            /* run of Lu / Ll */
+            b = gc_mask & (M(Lu) | M(Ll));
+            if (b != 0) {
+                if (b == (M(Lu) | M(Ll))) {
+                    goto add_range;
+                } else {
+                    c0 += ((gc_mask & M(Ll)) != 0);
+                    for(; c0 < c; c0 += 2) {
+                        if (cr_add_interval(cr, c0, c0 + 1))
+                            return -1;
+                    }
+                }
+            }
+        } else if ((gc_mask >> v) & 1) {
+        add_range:
+            if (cr_add_interval(cr, c0, c))
+                return -1;
+        }
+    }
+    return 0;
+}
+
+static int unicode_prop1(CharRange *cr, int prop_idx)
+{
+    const uint8_t *p, *p_end;
+    uint32_t c, c0, b, bit;
+
+    p = unicode_prop_table[prop_idx];
+    p_end = p + unicode_prop_len_table[prop_idx];
+    c = 0;
+    bit = 0;
+    /* Compressed range encoding:
+       00..3F: 2 packed lengths: 3-bit + 3-bit
+       40..5F: 5-bits plus extra byte for length
+       60..7F: 5-bits plus 2 extra bytes for length
+       80..FF: 7-bit length
+       lengths must be incremented to get character count
+       Ranges alternate between false and true return value.
+     */
+    while (p < p_end) {
+        c0 = c;
+        b = *p++;
+        if (b < 64) {
+            c += (b >> 3) + 1;
+            if (bit)  {
+                if (cr_add_interval(cr, c0, c))
+                    return -1;
+            }
+            bit ^= 1;
+            c0 = c;
+            c += (b & 7) + 1;
+        } else if (b >= 0x80) {
+            c += b - 0x80 + 1;
+        } else if (b < 0x60) {
+            c += (((b - 0x40) << 8) | p[0]) + 1;
+            p++;
+        } else {
+            c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
+            p += 2;
+        }
+        if (bit)  {
+            if (cr_add_interval(cr, c0, c))
+                return -1;
+        }
+        bit ^= 1;
+    }
+    return 0;
+}
+
+#define CASE_U (1 << 0)
+#define CASE_L (1 << 1)
+#define CASE_F (1 << 2)
+
+/* use the case conversion table to generate range of characters.
+   CASE_U: set char if modified by uppercasing,
+   CASE_L: set char if modified by lowercasing,
+   CASE_F: set char if modified by case folding,
+ */
+static int unicode_case1(CharRange *cr, int case_mask)
+{
+#define MR(x) (1 << RUN_TYPE_ ## x)
+    const uint32_t tab_run_mask[3] = {
+        MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
+        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
+
+        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
+
+        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
+    };
+#undef MR
+    uint32_t mask, v, code, type, len, i, idx;
+
+    if (case_mask == 0)
+        return 0;
+    mask = 0;
+    for(i = 0; i < 3; i++) {
+        if ((case_mask >> i) & 1)
+            mask |= tab_run_mask[i];
+    }
+    for(idx = 0; idx < countof(case_conv_table1); idx++) {
+        v = case_conv_table1[idx];
+        type = (v >> (32 - 17 - 7 - 4)) & 0xf;
+        code = v >> (32 - 17);
+        len = (v >> (32 - 17 - 7)) & 0x7f;
+        if ((mask >> type) & 1) {
+            //            printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
+            switch(type) {
+            case RUN_TYPE_UL:
+                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
+                    goto def_case;
+                code += ((case_mask & CASE_U) != 0);
+                for(i = 0; i < len; i += 2) {
+                    if (cr_add_interval(cr, code + i, code + i + 1))
+                        return -1;
+                }
+                break;
+            case RUN_TYPE_LSU:
+                if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
+                    goto def_case;
+                if (!(case_mask & CASE_U)) {
+                    if (cr_add_interval(cr, code, code + 1))
+                        return -1;
+                }
+                if (cr_add_interval(cr, code + 1, code + 2))
+                    return -1;
+                if (case_mask & CASE_U) {
+                    if (cr_add_interval(cr, code + 2, code + 3))
+                        return -1;
+                }
+                break;
+            default:
+            def_case:
+                if (cr_add_interval(cr, code, code + len))
+                    return -1;
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+static int point_cmp(const void *p1, const void *p2, void *arg)
+{
+    uint32_t v1 = *(uint32_t *)p1;
+    uint32_t v2 = *(uint32_t *)p2;
+    return (v1 > v2) - (v1 < v2);
+}
+
+static void cr_sort_and_remove_overlap(CharRange *cr)
+{
+    uint32_t start, end, start1, end1, i, j;
+
+    /* the resulting ranges are not necessarily sorted and may overlap */
+    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
+    j = 0;
+    for(i = 0; i < cr->len; ) {
+        start = cr->points[i];
+        end = cr->points[i + 1];
+        i += 2;
+        while (i < cr->len) {
+            start1 = cr->points[i];
+            end1 = cr->points[i + 1];
+            if (start1 > end) {
+                /* |------|
+                 *           |-------| */
+                break;
+            } else if (end1 <= end) {
+                /* |------|
+                 *    |--| */
+                i += 2;
+            } else {
+                /* |------|
+                 *     |-------| */
+                end = end1;
+                i += 2;
+            }
+        }
+        cr->points[j] = start;
+        cr->points[j + 1] = end;
+        j += 2;
+    }
+    cr->len = j;
+}
+
+/* canonicalize a character set using the JS regex case folding rules
+   (see lre_canonicalize()) */
+int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
+{
+    CharRange cr_inter, cr_mask, cr_result, cr_sub;
+    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
+
+    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
+
+    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
+        goto fail;
+    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
+        goto fail;
+
+    if (cr_invert(&cr_mask))
+        goto fail;
+    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
+        goto fail;
+
+    /* cr_inter = cr & cr_mask */
+    /* cr_sub = cr & ~cr_mask */
+
+    /* use the case conversion table to compute the result */
+    d_start = -1;
+    d_end = -1;
+    idx = 0;
+    v = case_conv_table1[idx];
+    code = v >> (32 - 17);
+    len = (v >> (32 - 17 - 7)) & 0x7f;
+    for(i = 0; i < cr_inter.len; i += 2) {
+        start = cr_inter.points[i];
+        end = cr_inter.points[i + 1];
+
+        for(c = start; c < end; c++) {
+            for(;;) {
+                if (c >= code && c < code + len)
+                    break;
+                idx++;
+                assert(idx < countof(case_conv_table1));
+                v = case_conv_table1[idx];
+                code = v >> (32 - 17);
+                len = (v >> (32 - 17 - 7)) & 0x7f;
+            }
+            d = lre_case_folding_entry(c, idx, v, is_unicode);
+            /* try to merge with the current interval */
+            if (d_start == -1) {
+                d_start = d;
+                d_end = d + 1;
+            } else if (d_end == d) {
+                d_end++;
+            } else {
+                cr_add_interval(&cr_result, d_start, d_end);
+                d_start = d;
+                d_end = d + 1;
+            }
+        }
+    }
+    if (d_start != -1) {
+        if (cr_add_interval(&cr_result, d_start, d_end))
+            goto fail;
+    }
+
+    /* the resulting ranges are not necessarily sorted and may overlap */
+    cr_sort_and_remove_overlap(&cr_result);
+
+    /* or with the character not affected by the case folding */
+    cr->len = 0;
+    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
+        goto fail;
+
+    cr_free(&cr_inter);
+    cr_free(&cr_mask);
+    cr_free(&cr_result);
+    cr_free(&cr_sub);
+    return 0;
+ fail:
+    cr_free(&cr_inter);
+    cr_free(&cr_mask);
+    cr_free(&cr_result);
+    cr_free(&cr_sub);
+    return -1;
+}
+
+typedef enum {
+    POP_GC,
+    POP_PROP,
+    POP_CASE,
+    POP_UNION,
+    POP_INTER,
+    POP_XOR,
+    POP_INVERT,
+    POP_END,
+} PropOPEnum;
+
+#define POP_STACK_LEN_MAX 4
+
+static int unicode_prop_ops(CharRange *cr, ...)
+{
+    va_list ap;
+    CharRange stack[POP_STACK_LEN_MAX];
+    int stack_len, op, ret, i;
+    uint32_t a;
+
+    va_start(ap, cr);
+    stack_len = 0;
+    for(;;) {
+        op = va_arg(ap, int);
+        switch(op) {
+        case POP_GC:
+            assert(stack_len < POP_STACK_LEN_MAX);
+            a = va_arg(ap, int);
+            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
+            if (unicode_general_category1(&stack[stack_len - 1], a))
+                goto fail;
+            break;
+        case POP_PROP:
+            assert(stack_len < POP_STACK_LEN_MAX);
+            a = va_arg(ap, int);
+            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
+            if (unicode_prop1(&stack[stack_len - 1], a))
+                goto fail;
+            break;
+        case POP_CASE:
+            assert(stack_len < POP_STACK_LEN_MAX);
+            a = va_arg(ap, int);
+            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
+            if (unicode_case1(&stack[stack_len - 1], a))
+                goto fail;
+            break;
+        case POP_UNION:
+        case POP_INTER:
+        case POP_XOR:
+            {
+                CharRange *cr1, *cr2, *cr3;
+                assert(stack_len >= 2);
+                assert(stack_len < POP_STACK_LEN_MAX);
+                cr1 = &stack[stack_len - 2];
+                cr2 = &stack[stack_len - 1];
+                cr3 = &stack[stack_len++];
+                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
+                if (cr_op(cr3, cr1->points, cr1->len,
+                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
+                    goto fail;
+                cr_free(cr1);
+                cr_free(cr2);
+                *cr1 = *cr3;
+                stack_len -= 2;
+            }
+            break;
+        case POP_INVERT:
+            assert(stack_len >= 1);
+            if (cr_invert(&stack[stack_len - 1]))
+                goto fail;
+            break;
+        case POP_END:
+            goto done;
+        default:
+            abort();
+        }
+    }
+ done:
+    va_end(ap);
+    assert(stack_len == 1);
+    ret = cr_copy(cr, &stack[0]);
+    cr_free(&stack[0]);
+    return ret;
+ fail:
+    va_end(ap);
+    for(i = 0; i < stack_len; i++)
+        cr_free(&stack[i]);
+    return -1;
+}
+
+static const uint32_t unicode_gc_mask_table[] = {
+    M(Lu) | M(Ll) | M(Lt), /* LC */
+    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
+    M(Mn) | M(Mc) | M(Me), /* M */
+    M(Nd) | M(Nl) | M(No), /* N */
+    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
+    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
+    M(Zs) | M(Zl) | M(Zp), /* Z */
+    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
+};
+
+/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
+   if not found */
+int unicode_general_category(CharRange *cr, const char *gc_name)
+{
+    int gc_idx;
+    uint32_t gc_mask;
+
+    gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
+    if (gc_idx < 0)
+        return -2;
+    if (gc_idx <= UNICODE_GC_Co) {
+        gc_mask = (uint64_t)1 << gc_idx;
+    } else {
+        gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
+    }
+    return unicode_general_category1(cr, gc_mask);
+}
+
+
+/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
+   if not found */
+int unicode_prop(CharRange *cr, const char *prop_name)
+{
+    int prop_idx, ret;
+
+    prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
+    if (prop_idx < 0)
+        return -2;
+    prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
+
+    ret = 0;
+    switch(prop_idx) {
+    case UNICODE_PROP_ASCII:
+        if (cr_add_interval(cr, 0x00, 0x7f + 1))
+            return -1;
+        break;
+    case UNICODE_PROP_Any:
+        if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
+            return -1;
+        break;
+    case UNICODE_PROP_Assigned:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Cn),
+                               POP_INVERT,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Math:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Sm),
+                               POP_PROP, UNICODE_PROP_Other_Math,
+                               POP_UNION,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Lowercase:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Ll),
+                               POP_PROP, UNICODE_PROP_Other_Lowercase,
+                               POP_UNION,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Uppercase:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu),
+                               POP_PROP, UNICODE_PROP_Other_Uppercase,
+                               POP_UNION,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Cased:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu) | M(Ll) | M(Lt),
+                               POP_PROP, UNICODE_PROP_Other_Uppercase,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Other_Lowercase,
+                               POP_UNION,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Alphabetic:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
+                               POP_PROP, UNICODE_PROP_Other_Uppercase,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Other_Lowercase,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Other_Alphabetic,
+                               POP_UNION,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Grapheme_Base:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
+                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
+                               POP_UNION,
+                               POP_INVERT,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Grapheme_Extend:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Me) | M(Mn),
+                               POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
+                               POP_UNION,
+                               POP_END);
+        break;
+    case UNICODE_PROP_XID_Start:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
+                               POP_PROP, UNICODE_PROP_Other_ID_Start,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
+                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_XID_Start1,
+                               POP_UNION,
+                               POP_INVERT,
+                               POP_INTER,
+                               POP_END);
+        break;
+    case UNICODE_PROP_XID_Continue:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
+                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
+                               POP_PROP, UNICODE_PROP_Other_ID_Start,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
+                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_XID_Continue1,
+                               POP_UNION,
+                               POP_INVERT,
+                               POP_INTER,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Changes_When_Uppercased:
+        ret = unicode_case1(cr, CASE_U);
+        break;
+    case UNICODE_PROP_Changes_When_Lowercased:
+        ret = unicode_case1(cr, CASE_L);
+        break;
+    case UNICODE_PROP_Changes_When_Casemapped:
+        ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
+        break;
+    case UNICODE_PROP_Changes_When_Titlecased:
+        ret = unicode_prop_ops(cr,
+                               POP_CASE, CASE_U,
+                               POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
+                               POP_XOR,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Changes_When_Casefolded:
+        ret = unicode_prop_ops(cr,
+                               POP_CASE, CASE_F,
+                               POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
+                               POP_XOR,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Changes_When_NFKC_Casefolded:
+        ret = unicode_prop_ops(cr,
+                               POP_CASE, CASE_F,
+                               POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
+                               POP_XOR,
+                               POP_END);
+        break;
+#if 0
+    case UNICODE_PROP_ID_Start:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
+                               POP_PROP, UNICODE_PROP_Other_ID_Start,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
+                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
+                               POP_UNION,
+                               POP_INVERT,
+                               POP_INTER,
+                               POP_END);
+        break;
+    case UNICODE_PROP_ID_Continue:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
+                               M(Mn) | M(Mc) | M(Nd) | M(Pc),
+                               POP_PROP, UNICODE_PROP_Other_ID_Start,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Other_ID_Continue,
+                               POP_UNION,
+                               POP_PROP, UNICODE_PROP_Pattern_Syntax,
+                               POP_PROP, UNICODE_PROP_Pattern_White_Space,
+                               POP_UNION,
+                               POP_INVERT,
+                               POP_INTER,
+                               POP_END);
+        break;
+    case UNICODE_PROP_Case_Ignorable:
+        ret = unicode_prop_ops(cr,
+                               POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
+                               POP_PROP, UNICODE_PROP_Case_Ignorable1,
+                               POP_XOR,
+                               POP_END);
+        break;
+#else
+        /* we use the existing tables */
+    case UNICODE_PROP_ID_Continue:
+        ret = unicode_prop_ops(cr,
+                               POP_PROP, UNICODE_PROP_ID_Start,
+                               POP_PROP, UNICODE_PROP_ID_Continue1,
+                               POP_XOR,
+                               POP_END);
+        break;
+#endif
+    default:
+        if (prop_idx >= countof(unicode_prop_table))
+            return -2;
+        ret = unicode_prop1(cr, prop_idx);
+        break;
+    }
+    return ret;
+}
+
+#endif /* CONFIG_ALL_UNICODE */
+
+/*---- lre codepoint categorizing functions ----*/
+
+#define S  UNICODE_C_SPACE
+#define D  UNICODE_C_DIGIT
+#define X  UNICODE_C_XDIGIT
+#define U  UNICODE_C_UPPER
+#define L  UNICODE_C_LOWER
+#define _  UNICODE_C_UNDER
+#define d  UNICODE_C_DOLLAR
+
+uint8_t const lre_ctype_bits[256] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, S, S, S, S, S, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    S, 0, 0, 0, d, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
+    X|D, X|D, 0, 0, 0, 0, 0, 0,
+
+    0, X|U, X|U, X|U, X|U, X|U, X|U, U,
+    U, U, U, U, U, U, U, U,
+    U, U, U, U, U, U, U, U,
+    U, U, U, 0, 0, 0, 0, _,
+
+    0, X|L, X|L, X|L, X|L, X|L, X|L, L,
+    L, L, L, L, L, L, L, L,
+    L, L, L, L, L, L, L, L,
+    L, L, L, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    S, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+#undef S
+#undef D
+#undef X
+#undef U
+#undef L
+#undef _
+#undef d
+
+/* code point ranges for Zs,Zl or Zp property */
+static const uint16_t char_range_s[] = {
+    10,
+    0x0009, 0x000D + 1,
+    0x0020, 0x0020 + 1,
+    0x00A0, 0x00A0 + 1,
+    0x1680, 0x1680 + 1,
+    0x2000, 0x200A + 1,
+    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
+    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
+    0x2028, 0x2029 + 1,
+    0x202F, 0x202F + 1,
+    0x205F, 0x205F + 1,
+    0x3000, 0x3000 + 1,
+    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
+    0xFEFF, 0xFEFF + 1,
+};
+
+BOOL lre_is_space_non_ascii(uint32_t c)
+{
+    size_t i, n;
+
+    n = countof(char_range_s);
+    for(i = 5; i < n; i += 2) {
+        uint32_t low = char_range_s[i];
+        uint32_t high = char_range_s[i + 1];
+        if (c < low)
+            return FALSE;
+        if (c < high)
+            return TRUE;
+    }
+    return FALSE;
+}