diff options
Diffstat (limited to 'src/shared/quickjs/libunicode.c')
-rw-r--r-- | src/shared/quickjs/libunicode.c | 1912 |
1 files changed, 1912 insertions, 0 deletions
diff --git a/src/shared/quickjs/libunicode.c b/src/shared/quickjs/libunicode.c new file mode 100644 index 000000000..482c51944 --- /dev/null +++ b/src/shared/quickjs/libunicode.c @@ -0,0 +1,1912 @@ +/* + * Unicode utilities + * + * Copyright (c) 2017-2018 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <assert.h> + +#include "cutils.h" +#include "libunicode.h" +#include "libunicode-table.h" + +enum { + RUN_TYPE_U, + RUN_TYPE_L, + RUN_TYPE_UF, + RUN_TYPE_LF, + RUN_TYPE_UL, + RUN_TYPE_LSU, + RUN_TYPE_U2L_399_EXT2, + RUN_TYPE_UF_D20, + RUN_TYPE_UF_D1_EXT, + RUN_TYPE_U_EXT, + RUN_TYPE_LF_EXT, + RUN_TYPE_UF_EXT2, + RUN_TYPE_LF_EXT2, + RUN_TYPE_UF_EXT3, +}; + +static int lre_case_conv1(uint32_t c, int conv_type) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + lre_case_conv(res, c, conv_type); + return res[0]; +} + +/* case conversion using the table entry 'idx' with value 'v' */ +static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v) +{ + uint32_t code, data, type, a, is_lower; + is_lower = (conv_type != 0); + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + data = ((v & 0xf) << 8) | case_conv_table2[idx]; + code = v >> (32 - 17); + switch(type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + if (conv_type == (type & 1) || + (type >= RUN_TYPE_UF && conv_type == 2)) { + c = c - code + (case_conv_table1[data] >> (32 - 17)); + } + break; + case RUN_TYPE_UL: + a = c - code; + if ((a & 1) != (1 - is_lower)) + break; + c = (a ^ 1) + code; + break; + case RUN_TYPE_LSU: + a = c - code; + if (a == 1) { + c += 2 * is_lower - 1; + } else if (a == (1 - is_lower) * 2) { + c += (2 * is_lower - 1) * 2; + } + break; + case RUN_TYPE_U2L_399_EXT2: + if (!is_lower) { + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = 0x399; + return 2; + } else { + c = c - code + case_conv_ext[data & 0x3f]; + } + break; + case RUN_TYPE_UF_D20: + if (conv_type == 1) + break; + c = data + (conv_type == 2) * 0x20; + break; + case RUN_TYPE_UF_D1_EXT: + if (conv_type == 1) + break; + c = case_conv_ext[data] + (conv_type == 2); + break; + case RUN_TYPE_U_EXT: + case RUN_TYPE_LF_EXT: + if (is_lower != (type - RUN_TYPE_U_EXT)) + break; + c = case_conv_ext[data]; + break; + case RUN_TYPE_LF_EXT2: + if (!is_lower) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + return 2; + case RUN_TYPE_UF_EXT2: + if (conv_type == 1) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + } + return 2; + default: + case RUN_TYPE_UF_EXT3: + if (conv_type == 1) + break; + res[0] = case_conv_ext[data >> 8]; + res[1] = case_conv_ext[(data >> 4) & 0xf]; + res[2] = case_conv_ext[data & 0xf]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + res[2] = lre_case_conv1(res[2], 1); + } + return 3; + } + res[0] = c; + return 1; +} + +/* conv_type: + 0 = to upper + 1 = to lower + 2 = case folding (= to lower with modifications) +*/ +int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) +{ + if (c < 128) { + if (conv_type) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return lre_case_conv_entry(res, c, conv_type, idx, v); + } + } + } + res[0] = c; + return 1; +} + +static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + + if (is_unicode) { + len = lre_case_conv_entry(res, c, 2, idx, v); + if (len == 1) { + c = res[0]; + } else { + /* handle the few specific multi-character cases (see + unicode_gen.c:dump_case_folding_special_cases()) */ + if (c == 0xfb06) { + c = 0xfb05; + } else if (c == 0x01fd3) { + c = 0x390; + } else if (c == 0x01fe3) { + c = 0x3b0; + } + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy regexp: to upper case if single char >= 128 */ + len = lre_case_conv_entry(res, c, FALSE, idx, v); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +/* JS regexp specific rules for case folding */ +int lre_canonicalize(uint32_t c, BOOL is_unicode) +{ + if (c < 128) { + /* fast case */ + if (is_unicode) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return lre_case_folding_entry(c, idx, v, is_unicode); + } + } + } + return c; +} + +static uint32_t get_le24(const uint8_t *ptr) +{ + return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); +} + +#define UNICODE_INDEX_BLOCK_LEN 32 + +/* return -1 if not in table, otherwise the offset in the block */ +static int get_index_pos(uint32_t *pcode, uint32_t c, + const uint8_t *index_table, int index_table_len) +{ + uint32_t code, v; + int idx_min, idx_max, idx; + + idx_min = 0; + v = get_le24(index_table); + code = v & ((1 << 21) - 1); + if (c < code) { + *pcode = 0; + return 0; + } + idx_max = index_table_len - 1; + code = get_le24(index_table + idx_max * 3); + if (c >= code) + return -1; + /* invariant: tab[idx_min] <= c < tab2[idx_max] */ + while ((idx_max - idx_min) > 1) { + idx = (idx_max + idx_min) / 2; + v = get_le24(index_table + idx * 3); + code = v & ((1 << 21) - 1); + if (c < code) { + idx_max = idx; + } else { + idx_min = idx; + } + } + v = get_le24(index_table + idx_min * 3); + *pcode = v & ((1 << 21) - 1); + return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21); +} + +static BOOL lre_is_in_table(uint32_t c, const uint8_t *table, + const uint8_t *index_table, int index_table_len) +{ + uint32_t code, b, bit; + int pos; + const uint8_t *p; + + pos = get_index_pos(&code, c, index_table, index_table_len); + if (pos < 0) + return FALSE; /* outside the table */ + p = table + pos; + bit = 0; + /* Compressed run length encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. + */ + for(;;) { + b = *p++; + if (b < 64) { + code += (b >> 3) + 1; + if (c < code) + return bit; + bit ^= 1; + code += (b & 7) + 1; + } else if (b >= 0x80) { + code += b - 0x80 + 1; + } else if (b < 0x60) { + code += (((b - 0x40) << 8) | p[0]) + 1; + p++; + } else { + code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1; + p += 2; + } + if (c < code) + return bit; + bit ^= 1; + } +} + +BOOL lre_is_cased(uint32_t c) +{ + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return TRUE; + } + } + return lre_is_in_table(c, unicode_prop_Cased1_table, + unicode_prop_Cased1_index, + sizeof(unicode_prop_Cased1_index) / 3); +} + +BOOL lre_is_case_ignorable(uint32_t c) +{ + return lre_is_in_table(c, unicode_prop_Case_Ignorable_table, + unicode_prop_Case_Ignorable_index, + sizeof(unicode_prop_Case_Ignorable_index) / 3); +} + +/* character range */ + +static __maybe_unused void cr_dump(CharRange *cr) +{ + int i; + for(i = 0; i < cr->len; i++) + printf("%d: 0x%04x\n", i, cr->points[i]); +} + +static void *cr_default_realloc(void *opaque, void *ptr, size_t size) +{ + return realloc(ptr, size); +} + +void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func) +{ + cr->len = cr->size = 0; + cr->points = NULL; + cr->mem_opaque = mem_opaque; + cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc; +} + +void cr_free(CharRange *cr) +{ + cr->realloc_func(cr->mem_opaque, cr->points, 0); +} + +int cr_realloc(CharRange *cr, int size) +{ + int new_size; + uint32_t *new_buf; + + if (size > cr->size) { + new_size = max_int(size, cr->size * 3 / 2); + new_buf = cr->realloc_func(cr->mem_opaque, cr->points, + new_size * sizeof(cr->points[0])); + if (!new_buf) + return -1; + cr->points = new_buf; + cr->size = new_size; + } + return 0; +} + +int cr_copy(CharRange *cr, const CharRange *cr1) +{ + if (cr_realloc(cr, cr1->len)) + return -1; + memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len); + cr->len = cr1->len; + return 0; +} + +/* merge consecutive intervals and remove empty intervals */ +static void cr_compress(CharRange *cr) +{ + int i, j, k, len; + uint32_t *pt; + + pt = cr->points; + len = cr->len; + i = 0; + j = 0; + k = 0; + while ((i + 1) < len) { + if (pt[i] == pt[i + 1]) { + /* empty interval */ + i += 2; + } else { + j = i; + while ((j + 3) < len && pt[j + 1] == pt[j + 2]) + j += 2; + /* just copy */ + pt[k] = pt[i]; + pt[k + 1] = pt[j + 1]; + k += 2; + i = j + 2; + } + } + cr->len = k; +} + +/* union or intersection */ +int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, + const uint32_t *b_pt, int b_len, int op) +{ + int a_idx, b_idx, is_in; + uint32_t v; + + a_idx = 0; + b_idx = 0; + for(;;) { + /* get one more point from a or b in increasing order */ + if (a_idx < a_len && b_idx < b_len) { + if (a_pt[a_idx] < b_pt[b_idx]) { + goto a_add; + } else if (a_pt[a_idx] == b_pt[b_idx]) { + v = a_pt[a_idx]; + a_idx++; + b_idx++; + } else { + goto b_add; + } + } else if (a_idx < a_len) { + a_add: + v = a_pt[a_idx++]; + } else if (b_idx < b_len) { + b_add: + v = b_pt[b_idx++]; + } else { + break; + } + /* add the point if the in/out status changes */ + switch(op) { + case CR_OP_UNION: + is_in = (a_idx & 1) | (b_idx & 1); + break; + case CR_OP_INTER: + is_in = (a_idx & 1) & (b_idx & 1); + break; + case CR_OP_XOR: + is_in = (a_idx & 1) ^ (b_idx & 1); + break; + default: + abort(); + } + if (is_in != (cr->len & 1)) { + if (cr_add_point(cr, v)) + return -1; + } + } + cr_compress(cr); + return 0; +} + +int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len) +{ + CharRange a = *cr; + int ret; + cr->len = 0; + cr->size = 0; + cr->points = NULL; + ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION); + cr_free(&a); + return ret; +} + +int cr_invert(CharRange *cr) +{ + int len; + len = cr->len; + if (cr_realloc(cr, len + 2)) + return -1; + memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0])); + cr->points[0] = 0; + cr->points[len + 1] = UINT32_MAX; + cr->len = len + 2; + cr_compress(cr); + return 0; +} + +#ifdef CONFIG_ALL_UNICODE + +BOOL lre_is_id_start(uint32_t c) +{ + return lre_is_in_table(c, unicode_prop_ID_Start_table, + unicode_prop_ID_Start_index, + sizeof(unicode_prop_ID_Start_index) / 3); +} + +BOOL lre_is_id_continue(uint32_t c) +{ + return lre_is_id_start(c) || + lre_is_in_table(c, unicode_prop_ID_Continue1_table, + unicode_prop_ID_Continue1_index, + sizeof(unicode_prop_ID_Continue1_index) / 3); +} + +#define UNICODE_DECOMP_LEN_MAX 18 + +typedef enum { + DECOMP_TYPE_C1, /* 16 bit char */ + DECOMP_TYPE_L1, /* 16 bit char table */ + DECOMP_TYPE_L2, + DECOMP_TYPE_L3, + DECOMP_TYPE_L4, + DECOMP_TYPE_L5, /* XXX: not used */ + DECOMP_TYPE_L6, /* XXX: could remove */ + DECOMP_TYPE_L7, /* XXX: could remove */ + DECOMP_TYPE_LL1, /* 18 bit char table */ + DECOMP_TYPE_LL2, + DECOMP_TYPE_S1, /* 8 bit char table */ + DECOMP_TYPE_S2, + DECOMP_TYPE_S3, + DECOMP_TYPE_S4, + DECOMP_TYPE_S5, + DECOMP_TYPE_I1, /* increment 16 bit char value */ + DECOMP_TYPE_I2_0, + DECOMP_TYPE_I2_1, + DECOMP_TYPE_I3_1, + DECOMP_TYPE_I3_2, + DECOMP_TYPE_I4_1, + DECOMP_TYPE_I4_2, + DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */ + DECOMP_TYPE_B2, + DECOMP_TYPE_B3, + DECOMP_TYPE_B4, + DECOMP_TYPE_B5, + DECOMP_TYPE_B6, + DECOMP_TYPE_B7, + DECOMP_TYPE_B8, + DECOMP_TYPE_B18, + DECOMP_TYPE_LS2, + DECOMP_TYPE_PAT3, + DECOMP_TYPE_S2_UL, + DECOMP_TYPE_LS2_UL, +} DecompTypeEnum; + +static uint32_t unicode_get_short_code(uint32_t c) +{ + static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 }; + + if (c < 0x80) + return c; + else if (c < 0x80 + 0x50) + return c - 0x80 + 0x300; + else + return unicode_short_table[c - 0x80 - 0x50]; +} + +static uint32_t unicode_get_lower_simple(uint32_t c) +{ + if (c < 0x100 || (c >= 0x410 && c <= 0x42f)) + c += 0x20; + else + c++; + return c; +} + +static uint16_t unicode_get16(const uint8_t *p) +{ + return p[0] | (p[1] << 8); +} + +static int unicode_decomp_entry(uint32_t *res, uint32_t c, + int idx, uint32_t code, uint32_t len, + uint32_t type) +{ + uint32_t c1; + int l, i, p; + const uint8_t *d; + + if (type == DECOMP_TYPE_C1) { + res[0] = unicode_decomp_table2[idx]; + return 1; + } else { + d = unicode_decomp_data + unicode_decomp_table2[idx]; + switch(type) { + case DECOMP_TYPE_L1: + case DECOMP_TYPE_L2: + case DECOMP_TYPE_L3: + case DECOMP_TYPE_L4: + case DECOMP_TYPE_L5: + case DECOMP_TYPE_L6: + case DECOMP_TYPE_L7: + l = type - DECOMP_TYPE_L1 + 1; + d += (c - code) * l * 2; + for(i = 0; i < l; i++) { + if ((res[i] = unicode_get16(d + 2 * i)) == 0) + return 0; + } + return l; + case DECOMP_TYPE_LL1: + case DECOMP_TYPE_LL2: + { + uint32_t k, p; + l = type - DECOMP_TYPE_LL1 + 1; + k = (c - code) * l; + p = len * l * 2; + for(i = 0; i < l; i++) { + c1 = unicode_get16(d + 2 * k) | + (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16); + if (!c1) + return 0; + res[i] = c1; + k++; + } + } + return l; + case DECOMP_TYPE_S1: + case DECOMP_TYPE_S2: + case DECOMP_TYPE_S3: + case DECOMP_TYPE_S4: + case DECOMP_TYPE_S5: + l = type - DECOMP_TYPE_S1 + 1; + d += (c - code) * l; + for(i = 0; i < l; i++) { + if ((res[i] = unicode_get_short_code(d[i])) == 0) + return 0; + } + return l; + case DECOMP_TYPE_I1: + l = 1; + p = 0; + goto decomp_type_i; + case DECOMP_TYPE_I2_0: + case DECOMP_TYPE_I2_1: + case DECOMP_TYPE_I3_1: + case DECOMP_TYPE_I3_2: + case DECOMP_TYPE_I4_1: + case DECOMP_TYPE_I4_2: + l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1); + p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2); + decomp_type_i: + for(i = 0; i < l; i++) { + c1 = unicode_get16(d + 2 * i); + if (i == p) + c1 += c - code; + res[i] = c1; + } + return l; + case DECOMP_TYPE_B18: + l = 18; + goto decomp_type_b; + case DECOMP_TYPE_B1: + case DECOMP_TYPE_B2: + case DECOMP_TYPE_B3: + case DECOMP_TYPE_B4: + case DECOMP_TYPE_B5: + case DECOMP_TYPE_B6: + case DECOMP_TYPE_B7: + case DECOMP_TYPE_B8: + l = type - DECOMP_TYPE_B1 + 1; + decomp_type_b: + { + uint32_t c_min; + c_min = unicode_get16(d); + d += 2 + (c - code) * l; + for(i = 0; i < l; i++) { + c1 = d[i]; + if (c1 == 0xff) + c1 = 0x20; + else + c1 += c_min; + res[i] = c1; + } + } + return l; + case DECOMP_TYPE_LS2: + d += (c - code) * 3; + if (!(res[0] = unicode_get16(d))) + return 0; + res[1] = unicode_get_short_code(d[2]); + return 2; + case DECOMP_TYPE_PAT3: + res[0] = unicode_get16(d); + res[2] = unicode_get16(d + 2); + d += 4 + (c - code) * 2; + res[1] = unicode_get16(d); + return 3; + case DECOMP_TYPE_S2_UL: + case DECOMP_TYPE_LS2_UL: + c1 = c - code; + if (type == DECOMP_TYPE_S2_UL) { + d += c1 & ~1; + c = unicode_get_short_code(*d); + d++; + } else { + d += (c1 >> 1) * 3; + c = unicode_get16(d); + d += 2; + } + if (c1 & 1) + c = unicode_get_lower_simple(c); + res[0] = c; + res[1] = unicode_get_short_code(*d); + return 2; + } + } + return 0; +} + + +/* return the length of the decomposition (length <= + UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */ +static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1) +{ + uint32_t v, type, is_compat, code, len; + int idx_min, idx_max, idx; + + idx_min = 0; + idx_max = countof(unicode_decomp_table1) - 1; + while (idx_min <= idx_max) { + idx = (idx_max + idx_min) / 2; + v = unicode_decomp_table1[idx]; + code = v >> (32 - 18); + len = (v >> (32 - 18 - 7)) & 0x7f; + // printf("idx=%d code=%05x len=%d\n", idx, code, len); + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + is_compat = v & 1; + if (is_compat1 < is_compat) + break; + type = (v >> (32 - 18 - 7 - 6)) & 0x3f; + return unicode_decomp_entry(res, c, idx, code, len, type); + } + } + return 0; +} + +/* return 0 if no pair found */ +static int unicode_compose_pair(uint32_t c0, uint32_t c1) +{ + uint32_t code, len, type, v, idx1, d_idx, d_offset, ch; + int idx_min, idx_max, idx, d; + uint32_t pair[2]; + + idx_min = 0; + idx_max = countof(unicode_comp_table) - 1; + while (idx_min <= idx_max) { + idx = (idx_max + idx_min) / 2; + idx1 = unicode_comp_table[idx]; + + /* idx1 represent an entry of the decomposition table */ + d_idx = idx1 >> 6; + d_offset = idx1 & 0x3f; + v = unicode_decomp_table1[d_idx]; + code = v >> (32 - 18); + len = (v >> (32 - 18 - 7)) & 0x7f; + type = (v >> (32 - 18 - 7 - 6)) & 0x3f; + ch = code + d_offset; + unicode_decomp_entry(pair, ch, d_idx, code, len, type); + d = c0 - pair[0]; + if (d == 0) + d = c1 - pair[1]; + if (d < 0) { + idx_max = idx - 1; + } else if (d > 0) { + idx_min = idx + 1; + } else { + return ch; + } + } + return 0; +} + +/* return the combining class of character c (between 0 and 255) */ +static int unicode_get_cc(uint32_t c) +{ + uint32_t code, n, type, cc, c1, b; + int pos; + const uint8_t *p; + + pos = get_index_pos(&code, c, + unicode_cc_index, sizeof(unicode_cc_index) / 3); + if (pos < 0) + return 0; + p = unicode_cc_table + pos; + /* Compressed run length encoding: + - 2 high order bits are combining class type + - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte + - 00..2F: range length (add 1) + - 30..37: 3-bit range-length + 1 extra byte + - 38..3F: 3-bit range-length + 2 extra byte + */ + for(;;) { + b = *p++; + type = b >> 6; + n = b & 0x3f; + if (n < 48) { + } else if (n < 56) { + n = (n - 48) << 8; + n |= *p++; + n += 48; + } else { + n = (n - 56) << 8; + n |= *p++ << 8; + n |= *p++; + n += 48 + (1 << 11); + } + if (type <= 1) + p++; + c1 = code + n + 1; + if (c < c1) { + switch(type) { + case 0: + cc = p[-1]; + break; + case 1: + cc = p[-1] + c - code; + break; + case 2: + cc = 0; + break; + default: + case 3: + cc = 230; + break; + } + return cc; + } + code = c1; + } +} + +static void sort_cc(int *buf, int len) +{ + int i, j, k, cc, cc1, start, ch1; + + for(i = 0; i < len; i++) { + cc = unicode_get_cc(buf[i]); + if (cc != 0) { + start = i; + j = i + 1; + while (j < len) { + ch1 = buf[j]; + cc1 = unicode_get_cc(ch1); + if (cc1 == 0) + break; + k = j - 1; + while (k >= start) { + if (unicode_get_cc(buf[k]) <= cc1) + break; + buf[k + 1] = buf[k]; + k--; + } + buf[k + 1] = ch1; + j++; + } +#if 0 + printf("cc:"); + for(k = start; k < j; k++) { + printf(" %3d", unicode_get_cc(buf[k])); + } + printf("\n"); +#endif + i = j; + } + } +} + +static void to_nfd_rec(DynBuf *dbuf, + const int *src, int src_len, int is_compat) +{ + uint32_t c, v; + int i, l; + uint32_t res[UNICODE_DECOMP_LEN_MAX]; + + for(i = 0; i < src_len; i++) { + c = src[i]; + if (c >= 0xac00 && c < 0xd7a4) { + /* Hangul decomposition */ + c -= 0xac00; + dbuf_put_u32(dbuf, 0x1100 + c / 588); + dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28); + v = c % 28; + if (v != 0) + dbuf_put_u32(dbuf, 0x11a7 + v); + } else { + l = unicode_decomp_char(res, c, is_compat); + if (l) { + to_nfd_rec(dbuf, (int *)res, l, is_compat); + } else { + dbuf_put_u32(dbuf, c); + } + } + } +} + +/* return 0 if not found */ +static int compose_pair(uint32_t c0, uint32_t c1) +{ + /* Hangul composition */ + if (c0 >= 0x1100 && c0 < 0x1100 + 19 && + c1 >= 0x1161 && c1 < 0x1161 + 21) { + return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28; + } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 && + (c0 - 0xac00) % 28 == 0 && + c1 >= 0x11a7 && c1 < 0x11a7 + 28) { + return c0 + c1 - 0x11a7; + } else { + return unicode_compose_pair(c0, c1); + } +} + +int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, + UnicodeNormalizationEnum n_type, + void *opaque, DynBufReallocFunc *realloc_func) +{ + int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len; + BOOL is_compat; + DynBuf dbuf_s, *dbuf = &dbuf_s; + + is_compat = n_type >> 1; + + dbuf_init2(dbuf, opaque, realloc_func); + if (dbuf_realloc(dbuf, sizeof(int) * src_len)) + goto fail; + + /* common case: latin1 is unaffected by NFC */ + if (n_type == UNICODE_NFC) { + for(i = 0; i < src_len; i++) { + if (src[i] >= 0x100) + goto not_latin1; + } + buf = (int *)dbuf->buf; + memcpy(buf, src, src_len * sizeof(int)); + *pdst = (uint32_t *)buf; + return src_len; + not_latin1: ; + } + + to_nfd_rec(dbuf, (const int *)src, src_len, is_compat); + if (dbuf_error(dbuf)) { + fail: + *pdst = NULL; + return -1; + } + buf = (int *)dbuf->buf; + buf_len = dbuf->size / sizeof(int); + + sort_cc(buf, buf_len); + + if (buf_len <= 1 || (n_type & 1) != 0) { + /* NFD / NFKD */ + *pdst = (uint32_t *)buf; + return buf_len; + } + + i = 1; + out_len = 1; + while (i < buf_len) { + /* find the starter character and test if it is blocked from + the character at 'i' */ + last_cc = unicode_get_cc(buf[i]); + starter_pos = out_len - 1; + while (starter_pos >= 0) { + cc = unicode_get_cc(buf[starter_pos]); + if (cc == 0) + break; + if (cc >= last_cc) + goto next; + last_cc = 256; + starter_pos--; + } + if (starter_pos >= 0 && + (p = compose_pair(buf[starter_pos], buf[i])) != 0) { + buf[starter_pos] = p; + i++; + } else { + next: + buf[out_len++] = buf[i++]; + } + } + *pdst = (uint32_t *)buf; + return out_len; +} + +/* char ranges for various unicode properties */ + +static int unicode_find_name(const char *name_table, const char *name) +{ + const char *p, *r; + int pos; + size_t name_len, len; + + p = name_table; + pos = 0; + name_len = strlen(name); + while (*p) { + for(;;) { + r = strchr(p, ','); + if (!r) + len = strlen(p); + else + len = r - p; + if (len == name_len && !memcmp(p, name, name_len)) + return pos; + p += len + 1; + if (!r) + break; + } + pos++; + } + return -1; +} + +/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 + if not found */ +int unicode_script(CharRange *cr, + const char *script_name, BOOL is_ext) +{ + int script_idx; + const uint8_t *p, *p_end; + uint32_t c, c1, b, n, v, v_len, i, type; + CharRange cr1_s, *cr1; + CharRange cr2_s, *cr2 = &cr2_s; + BOOL is_common; + + script_idx = unicode_find_name(unicode_script_name_table, script_name); + if (script_idx < 0) + return -2; + /* Note: we remove the "Unknown" Script */ + script_idx += UNICODE_SCRIPT_Unknown + 1; + + is_common = (script_idx == UNICODE_SCRIPT_Common || + script_idx == UNICODE_SCRIPT_Inherited); + if (is_ext) { + cr1 = &cr1_s; + cr_init(cr1, cr->mem_opaque, cr->realloc_func); + cr_init(cr2, cr->mem_opaque, cr->realloc_func); + } else { + cr1 = cr; + } + + p = unicode_script_table; + p_end = unicode_script_table + countof(unicode_script_table); + c = 0; + while (p < p_end) { + b = *p++; + type = b >> 7; + n = b & 0x7f; + if (n < 96) { + } else if (n < 112) { + n = (n - 96) << 8; + n |= *p++; + n += 96; + } else { + n = (n - 112) << 16; + n |= *p++ << 8; + n |= *p++; + n += 96 + (1 << 12); + } + if (type == 0) + v = 0; + else + v = *p++; + c1 = c + n + 1; + if (v == script_idx) { + if (cr_add_interval(cr1, c, c1)) + goto fail; + } + c = c1; + } + + if (is_ext) { + /* add the script extensions */ + p = unicode_script_ext_table; + p_end = unicode_script_ext_table + countof(unicode_script_ext_table); + c = 0; + while (p < p_end) { + b = *p++; + if (b < 128) { + n = b; + } else if (b < 128 + 64) { + n = (b - 128) << 8; + n |= *p++; + n += 128; + } else { + n = (b - 128 - 64) << 16; + n |= *p++ << 8; + n |= *p++; + n += 128 + (1 << 14); + } + c1 = c + n + 1; + v_len = *p++; + if (is_common) { + if (v_len != 0) { + if (cr_add_interval(cr2, c, c1)) + goto fail; + } + } else { + for(i = 0; i < v_len; i++) { + if (p[i] == script_idx) { + if (cr_add_interval(cr2, c, c1)) + goto fail; + break; + } + } + } + p += v_len; + c = c1; + } + if (is_common) { + /* remove all the characters with script extensions */ + if (cr_invert(cr2)) + goto fail; + if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len, + CR_OP_INTER)) + goto fail; + } else { + if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len, + CR_OP_UNION)) + goto fail; + } + cr_free(cr1); + cr_free(cr2); + } + return 0; + fail: + if (is_ext) { + cr_free(cr1); + cr_free(cr2); + } + goto fail; +} + +#define M(id) (1U << UNICODE_GC_ ## id) + +static int unicode_general_category1(CharRange *cr, uint32_t gc_mask) +{ + const uint8_t *p, *p_end; + uint32_t c, c0, b, n, v; + + p = unicode_gc_table; + p_end = unicode_gc_table + countof(unicode_gc_table); + c = 0; + /* Compressed range encoding: + initial byte: + bits 0..4: category number (special case 31) + bits 5..7: range length (add 1) + special case bits 5..7 == 7: read an extra byte + - 00..7F: range length (add 7 + 1) + - 80..BF: 6-bits plus extra byte for range length (add 7 + 128) + - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384) + */ + while (p < p_end) { + b = *p++; + n = b >> 5; + v = b & 0x1f; + if (n == 7) { + n = *p++; + if (n < 128) { + n += 7; + } else if (n < 128 + 64) { + n = (n - 128) << 8; + n |= *p++; + n += 7 + 128; + } else { + n = (n - 128 - 64) << 16; + n |= *p++ << 8; + n |= *p++; + n += 7 + 128 + (1 << 14); + } + } + c0 = c; + c += n + 1; + if (v == 31) { + /* run of Lu / Ll */ + b = gc_mask & (M(Lu) | M(Ll)); + if (b != 0) { + if (b == (M(Lu) | M(Ll))) { + goto add_range; + } else { + c0 += ((gc_mask & M(Ll)) != 0); + for(; c0 < c; c0 += 2) { + if (cr_add_interval(cr, c0, c0 + 1)) + return -1; + } + } + } + } else if ((gc_mask >> v) & 1) { + add_range: + if (cr_add_interval(cr, c0, c)) + return -1; + } + } + return 0; +} + +static int unicode_prop1(CharRange *cr, int prop_idx) +{ + const uint8_t *p, *p_end; + uint32_t c, c0, b, bit; + + p = unicode_prop_table[prop_idx]; + p_end = p + unicode_prop_len_table[prop_idx]; + c = 0; + bit = 0; + /* Compressed range encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. + */ + while (p < p_end) { + c0 = c; + b = *p++; + if (b < 64) { + c += (b >> 3) + 1; + if (bit) { + if (cr_add_interval(cr, c0, c)) + return -1; + } + bit ^= 1; + c0 = c; + c += (b & 7) + 1; + } else if (b >= 0x80) { + c += b - 0x80 + 1; + } else if (b < 0x60) { + c += (((b - 0x40) << 8) | p[0]) + 1; + p++; + } else { + c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1; + p += 2; + } + if (bit) { + if (cr_add_interval(cr, c0, c)) + return -1; + } + bit ^= 1; + } + return 0; +} + +#define CASE_U (1 << 0) +#define CASE_L (1 << 1) +#define CASE_F (1 << 2) + +/* use the case conversion table to generate range of characters. + CASE_U: set char if modified by uppercasing, + CASE_L: set char if modified by lowercasing, + CASE_F: set char if modified by case folding, + */ +static int unicode_case1(CharRange *cr, int case_mask) +{ +#define MR(x) (1 << RUN_TYPE_ ## x) + const uint32_t tab_run_mask[3] = { + MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | + MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3), + + MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2), + + MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3), + }; +#undef MR + uint32_t mask, v, code, type, len, i, idx; + + if (case_mask == 0) + return 0; + mask = 0; + for(i = 0; i < 3; i++) { + if ((case_mask >> i) & 1) + mask |= tab_run_mask[i]; + } + for(idx = 0; idx < countof(case_conv_table1); idx++) { + v = case_conv_table1[idx]; + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if ((mask >> type) & 1) { + // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1); + switch(type) { + case RUN_TYPE_UL: + if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F))) + goto def_case; + code += ((case_mask & CASE_U) != 0); + for(i = 0; i < len; i += 2) { + if (cr_add_interval(cr, code + i, code + i + 1)) + return -1; + } + break; + case RUN_TYPE_LSU: + if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F))) + goto def_case; + if (!(case_mask & CASE_U)) { + if (cr_add_interval(cr, code, code + 1)) + return -1; + } + if (cr_add_interval(cr, code + 1, code + 2)) + return -1; + if (case_mask & CASE_U) { + if (cr_add_interval(cr, code + 2, code + 3)) + return -1; + } + break; + default: + def_case: + if (cr_add_interval(cr, code, code + len)) + return -1; + break; + } + } + } + return 0; +} + +static int point_cmp(const void *p1, const void *p2, void *arg) +{ + uint32_t v1 = *(uint32_t *)p1; + uint32_t v2 = *(uint32_t *)p2; + return (v1 > v2) - (v1 < v2); +} + +static void cr_sort_and_remove_overlap(CharRange *cr) +{ + uint32_t start, end, start1, end1, i, j; + + /* the resulting ranges are not necessarily sorted and may overlap */ + rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL); + j = 0; + for(i = 0; i < cr->len; ) { + start = cr->points[i]; + end = cr->points[i + 1]; + i += 2; + while (i < cr->len) { + start1 = cr->points[i]; + end1 = cr->points[i + 1]; + if (start1 > end) { + /* |------| + * |-------| */ + break; + } else if (end1 <= end) { + /* |------| + * |--| */ + i += 2; + } else { + /* |------| + * |-------| */ + end = end1; + i += 2; + } + } + cr->points[j] = start; + cr->points[j + 1] = end; + j += 2; + } + cr->len = j; +} + +/* canonicalize a character set using the JS regex case folding rules + (see lre_canonicalize()) */ +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode) +{ + CharRange cr_inter, cr_mask, cr_result, cr_sub; + uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d; + + cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_result, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func); + + if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) + goto fail; + if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + if (cr_invert(&cr_mask)) + goto fail; + if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + /* cr_inter = cr & cr_mask */ + /* cr_sub = cr & ~cr_mask */ + + /* use the case conversion table to compute the result */ + d_start = -1; + d_end = -1; + idx = 0; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + for(i = 0; i < cr_inter.len; i += 2) { + start = cr_inter.points[i]; + end = cr_inter.points[i + 1]; + + for(c = start; c < end; c++) { + for(;;) { + if (c >= code && c < code + len) + break; + idx++; + assert(idx < countof(case_conv_table1)); + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + } + d = lre_case_folding_entry(c, idx, v, is_unicode); + /* try to merge with the current interval */ + if (d_start == -1) { + d_start = d; + d_end = d + 1; + } else if (d_end == d) { + d_end++; + } else { + cr_add_interval(&cr_result, d_start, d_end); + d_start = d; + d_end = d + 1; + } + } + } + if (d_start != -1) { + if (cr_add_interval(&cr_result, d_start, d_end)) + goto fail; + } + + /* the resulting ranges are not necessarily sorted and may overlap */ + cr_sort_and_remove_overlap(&cr_result); + + /* or with the character not affected by the case folding */ + cr->len = 0; + if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION)) + goto fail; + + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return 0; + fail: + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return -1; +} + +typedef enum { + POP_GC, + POP_PROP, + POP_CASE, + POP_UNION, + POP_INTER, + POP_XOR, + POP_INVERT, + POP_END, +} PropOPEnum; + +#define POP_STACK_LEN_MAX 4 + +static int unicode_prop_ops(CharRange *cr, ...) +{ + va_list ap; + CharRange stack[POP_STACK_LEN_MAX]; + int stack_len, op, ret, i; + uint32_t a; + + va_start(ap, cr); + stack_len = 0; + for(;;) { + op = va_arg(ap, int); + switch(op) { + case POP_GC: + assert(stack_len < POP_STACK_LEN_MAX); + a = va_arg(ap, int); + cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); + if (unicode_general_category1(&stack[stack_len - 1], a)) + goto fail; + break; + case POP_PROP: + assert(stack_len < POP_STACK_LEN_MAX); + a = va_arg(ap, int); + cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); + if (unicode_prop1(&stack[stack_len - 1], a)) + goto fail; + break; + case POP_CASE: + assert(stack_len < POP_STACK_LEN_MAX); + a = va_arg(ap, int); + cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); + if (unicode_case1(&stack[stack_len - 1], a)) + goto fail; + break; + case POP_UNION: + case POP_INTER: + case POP_XOR: + { + CharRange *cr1, *cr2, *cr3; + assert(stack_len >= 2); + assert(stack_len < POP_STACK_LEN_MAX); + cr1 = &stack[stack_len - 2]; + cr2 = &stack[stack_len - 1]; + cr3 = &stack[stack_len++]; + cr_init(cr3, cr->mem_opaque, cr->realloc_func); + if (cr_op(cr3, cr1->points, cr1->len, + cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION)) + goto fail; + cr_free(cr1); + cr_free(cr2); + *cr1 = *cr3; + stack_len -= 2; + } + break; + case POP_INVERT: + assert(stack_len >= 1); + if (cr_invert(&stack[stack_len - 1])) + goto fail; + break; + case POP_END: + goto done; + default: + abort(); + } + } + done: + va_end(ap); + assert(stack_len == 1); + ret = cr_copy(cr, &stack[0]); + cr_free(&stack[0]); + return ret; + fail: + va_end(ap); + for(i = 0; i < stack_len; i++) + cr_free(&stack[i]); + return -1; +} + +static const uint32_t unicode_gc_mask_table[] = { + M(Lu) | M(Ll) | M(Lt), /* LC */ + M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */ + M(Mn) | M(Mc) | M(Me), /* M */ + M(Nd) | M(Nl) | M(No), /* N */ + M(Sm) | M(Sc) | M(Sk) | M(So), /* S */ + M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */ + M(Zs) | M(Zl) | M(Zp), /* Z */ + M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */ +}; + +/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 + if not found */ +int unicode_general_category(CharRange *cr, const char *gc_name) +{ + int gc_idx; + uint32_t gc_mask; + + gc_idx = unicode_find_name(unicode_gc_name_table, gc_name); + if (gc_idx < 0) + return -2; + if (gc_idx <= UNICODE_GC_Co) { + gc_mask = (uint64_t)1 << gc_idx; + } else { + gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC]; + } + return unicode_general_category1(cr, gc_mask); +} + + +/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 + if not found */ +int unicode_prop(CharRange *cr, const char *prop_name) +{ + int prop_idx, ret; + + prop_idx = unicode_find_name(unicode_prop_name_table, prop_name); + if (prop_idx < 0) + return -2; + prop_idx += UNICODE_PROP_ASCII_Hex_Digit; + + ret = 0; + switch(prop_idx) { + case UNICODE_PROP_ASCII: + if (cr_add_interval(cr, 0x00, 0x7f + 1)) + return -1; + break; + case UNICODE_PROP_Any: + if (cr_add_interval(cr, 0x00000, 0x10ffff + 1)) + return -1; + break; + case UNICODE_PROP_Assigned: + ret = unicode_prop_ops(cr, + POP_GC, M(Cn), + POP_INVERT, + POP_END); + break; + case UNICODE_PROP_Math: + ret = unicode_prop_ops(cr, + POP_GC, M(Sm), + POP_PROP, UNICODE_PROP_Other_Math, + POP_UNION, + POP_END); + break; + case UNICODE_PROP_Lowercase: + ret = unicode_prop_ops(cr, + POP_GC, M(Ll), + POP_PROP, UNICODE_PROP_Other_Lowercase, + POP_UNION, + POP_END); + break; + case UNICODE_PROP_Uppercase: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu), + POP_PROP, UNICODE_PROP_Other_Uppercase, + POP_UNION, + POP_END); + break; + case UNICODE_PROP_Cased: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu) | M(Ll) | M(Lt), + POP_PROP, UNICODE_PROP_Other_Uppercase, + POP_UNION, + POP_PROP, UNICODE_PROP_Other_Lowercase, + POP_UNION, + POP_END); + break; + case UNICODE_PROP_Alphabetic: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), + POP_PROP, UNICODE_PROP_Other_Uppercase, + POP_UNION, + POP_PROP, UNICODE_PROP_Other_Lowercase, + POP_UNION, + POP_PROP, UNICODE_PROP_Other_Alphabetic, + POP_UNION, + POP_END); + break; + case UNICODE_PROP_Grapheme_Base: + ret = unicode_prop_ops(cr, + POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn), + POP_PROP, UNICODE_PROP_Other_Grapheme_Extend, + POP_UNION, + POP_INVERT, + POP_END); + break; + case UNICODE_PROP_Grapheme_Extend: + ret = unicode_prop_ops(cr, + POP_GC, M(Me) | M(Mn), + POP_PROP, UNICODE_PROP_Other_Grapheme_Extend, + POP_UNION, + POP_END); + break; + case UNICODE_PROP_XID_Start: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), + POP_PROP, UNICODE_PROP_Other_ID_Start, + POP_UNION, + POP_PROP, UNICODE_PROP_Pattern_Syntax, + POP_PROP, UNICODE_PROP_Pattern_White_Space, + POP_UNION, + POP_PROP, UNICODE_PROP_XID_Start1, + POP_UNION, + POP_INVERT, + POP_INTER, + POP_END); + break; + case UNICODE_PROP_XID_Continue: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) | + M(Mn) | M(Mc) | M(Nd) | M(Pc), + POP_PROP, UNICODE_PROP_Other_ID_Start, + POP_UNION, + POP_PROP, UNICODE_PROP_Other_ID_Continue, + POP_UNION, + POP_PROP, UNICODE_PROP_Pattern_Syntax, + POP_PROP, UNICODE_PROP_Pattern_White_Space, + POP_UNION, + POP_PROP, UNICODE_PROP_XID_Continue1, + POP_UNION, + POP_INVERT, + POP_INTER, + POP_END); + break; + case UNICODE_PROP_Changes_When_Uppercased: + ret = unicode_case1(cr, CASE_U); + break; + case UNICODE_PROP_Changes_When_Lowercased: + ret = unicode_case1(cr, CASE_L); + break; + case UNICODE_PROP_Changes_When_Casemapped: + ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F); + break; + case UNICODE_PROP_Changes_When_Titlecased: + ret = unicode_prop_ops(cr, + POP_CASE, CASE_U, + POP_PROP, UNICODE_PROP_Changes_When_Titlecased1, + POP_XOR, + POP_END); + break; + case UNICODE_PROP_Changes_When_Casefolded: + ret = unicode_prop_ops(cr, + POP_CASE, CASE_F, + POP_PROP, UNICODE_PROP_Changes_When_Casefolded1, + POP_XOR, + POP_END); + break; + case UNICODE_PROP_Changes_When_NFKC_Casefolded: + ret = unicode_prop_ops(cr, + POP_CASE, CASE_F, + POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1, + POP_XOR, + POP_END); + break; +#if 0 + case UNICODE_PROP_ID_Start: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), + POP_PROP, UNICODE_PROP_Other_ID_Start, + POP_UNION, + POP_PROP, UNICODE_PROP_Pattern_Syntax, + POP_PROP, UNICODE_PROP_Pattern_White_Space, + POP_UNION, + POP_INVERT, + POP_INTER, + POP_END); + break; + case UNICODE_PROP_ID_Continue: + ret = unicode_prop_ops(cr, + POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) | + M(Mn) | M(Mc) | M(Nd) | M(Pc), + POP_PROP, UNICODE_PROP_Other_ID_Start, + POP_UNION, + POP_PROP, UNICODE_PROP_Other_ID_Continue, + POP_UNION, + POP_PROP, UNICODE_PROP_Pattern_Syntax, + POP_PROP, UNICODE_PROP_Pattern_White_Space, + POP_UNION, + POP_INVERT, + POP_INTER, + POP_END); + break; + case UNICODE_PROP_Case_Ignorable: + ret = unicode_prop_ops(cr, + POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk), + POP_PROP, UNICODE_PROP_Case_Ignorable1, + POP_XOR, + POP_END); + break; +#else + /* we use the existing tables */ + case UNICODE_PROP_ID_Continue: + ret = unicode_prop_ops(cr, + POP_PROP, UNICODE_PROP_ID_Start, + POP_PROP, UNICODE_PROP_ID_Continue1, + POP_XOR, + POP_END); + break; +#endif + default: + if (prop_idx >= countof(unicode_prop_table)) + return -2; + ret = unicode_prop1(cr, prop_idx); + break; + } + return ret; +} + +#endif /* CONFIG_ALL_UNICODE */ + +/*---- lre codepoint categorizing functions ----*/ + +#define S UNICODE_C_SPACE +#define D UNICODE_C_DIGIT +#define X UNICODE_C_XDIGIT +#define U UNICODE_C_UPPER +#define L UNICODE_C_LOWER +#define _ UNICODE_C_UNDER +#define d UNICODE_C_DOLLAR + +uint8_t const lre_ctype_bits[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, S, S, S, S, S, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + S, 0, 0, 0, d, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D, + X|D, X|D, 0, 0, 0, 0, 0, 0, + + 0, X|U, X|U, X|U, X|U, X|U, X|U, U, + U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, + U, U, U, 0, 0, 0, 0, _, + + 0, X|L, X|L, X|L, X|L, X|L, X|L, L, + L, L, L, L, L, L, L, L, + L, L, L, L, L, L, L, L, + L, L, L, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + S, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +#undef S +#undef D +#undef X +#undef U +#undef L +#undef _ +#undef d + +/* code point ranges for Zs,Zl or Zp property */ +static const uint16_t char_range_s[] = { + 10, + 0x0009, 0x000D + 1, + 0x0020, 0x0020 + 1, + 0x00A0, 0x00A0 + 1, + 0x1680, 0x1680 + 1, + 0x2000, 0x200A + 1, + /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ + /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ + 0x2028, 0x2029 + 1, + 0x202F, 0x202F + 1, + 0x205F, 0x205F + 1, + 0x3000, 0x3000 + 1, + /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ + 0xFEFF, 0xFEFF + 1, +}; + +BOOL lre_is_space_non_ascii(uint32_t c) +{ + size_t i, n; + + n = countof(char_range_s); + for(i = 5; i < n; i += 2) { + uint32_t low = char_range_s[i]; + uint32_t high = char_range_s[i + 1]; + if (c < low) + return FALSE; + if (c < high) + return TRUE; + } + return FALSE; +} |