aboutsummaryrefslogtreecommitdiffstats
path: root/src/shared/quickjs/libregexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/shared/quickjs/libregexp.c')
-rw-r--r--src/shared/quickjs/libregexp.c2503
1 files changed, 2503 insertions, 0 deletions
diff --git a/src/shared/quickjs/libregexp.c b/src/shared/quickjs/libregexp.c
new file mode 100644
index 000000000..4db042941
--- /dev/null
+++ b/src/shared/quickjs/libregexp.c
@@ -0,0 +1,2503 @@
+/*
+ * Regular Expression Engine
+ *
+ * Copyright (c) 2017-2018 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <string.h>
+#include <assert.h>
+
+#include "cutils.h"
+#include "libregexp.h"
+#include "libunicode.h"
+
+/*
+ TODO:
+
+ - Add a lock step execution mode (=linear time execution guaranteed)
+ when the regular expression is "simple" i.e. no backreference nor
+ complicated lookahead. The opcodes are designed for this execution
+ model.
+*/
+
+#if defined(TEST)
+#define DUMP_REOP
+#endif
+
+typedef enum {
+#define DEF(id, size) REOP_ ## id,
+#include "libregexp-opcode.h"
+#undef DEF
+ REOP_COUNT,
+} REOPCodeEnum;
+
+#define CAPTURE_COUNT_MAX 255
+#define STACK_SIZE_MAX 255
+
+/* unicode code points */
+#define CP_LS 0x2028
+#define CP_PS 0x2029
+
+#define TMP_BUF_SIZE 128
+
+typedef struct {
+ DynBuf byte_code;
+ const uint8_t *buf_ptr;
+ const uint8_t *buf_end;
+ const uint8_t *buf_start;
+ int re_flags;
+ BOOL is_unicode;
+ BOOL ignore_case;
+ BOOL dotall;
+ int capture_count;
+ int total_capture_count; /* -1 = not computed yet */
+ int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
+ void *opaque;
+ DynBuf group_names;
+ union {
+ char error_msg[TMP_BUF_SIZE];
+ char tmp_buf[TMP_BUF_SIZE];
+ } u;
+} REParseState;
+
+typedef struct {
+#ifdef DUMP_REOP
+ const char *name;
+#endif
+ uint8_t size;
+} REOpCode;
+
+static const REOpCode reopcode_info[REOP_COUNT] = {
+#ifdef DUMP_REOP
+#define DEF(id, size) { #id, size },
+#else
+#define DEF(id, size) { size },
+#endif
+#include "libregexp-opcode.h"
+#undef DEF
+};
+
+#define RE_HEADER_FLAGS 0
+#define RE_HEADER_CAPTURE_COUNT 1
+#define RE_HEADER_STACK_SIZE 2
+#define RE_HEADER_BYTECODE_LEN 3
+
+#define RE_HEADER_LEN 7
+
+static inline int is_digit(int c) {
+ return c >= '0' && c <= '9';
+}
+
+/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
+static int dbuf_insert(DynBuf *s, int pos, int len)
+{
+ if (dbuf_realloc(s, s->size + len))
+ return -1;
+ memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
+ s->size += len;
+ return 0;
+}
+
+static const uint16_t char_range_d[] = {
+ 1,
+ 0x0030, 0x0039 + 1,
+};
+
+/* code point ranges for Zs,Zl or Zp property */
+static const uint16_t char_range_s[] = {
+ 10,
+ 0x0009, 0x000D + 1,
+ 0x0020, 0x0020 + 1,
+ 0x00A0, 0x00A0 + 1,
+ 0x1680, 0x1680 + 1,
+ 0x2000, 0x200A + 1,
+ /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
+ /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
+ 0x2028, 0x2029 + 1,
+ 0x202F, 0x202F + 1,
+ 0x205F, 0x205F + 1,
+ 0x3000, 0x3000 + 1,
+ /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
+ 0xFEFF, 0xFEFF + 1,
+};
+
+static const uint16_t char_range_w[] = {
+ 4,
+ 0x0030, 0x0039 + 1,
+ 0x0041, 0x005A + 1,
+ 0x005F, 0x005F + 1,
+ 0x0061, 0x007A + 1,
+};
+
+#define CLASS_RANGE_BASE 0x40000000
+
+typedef enum {
+ CHAR_RANGE_d,
+ CHAR_RANGE_D,
+ CHAR_RANGE_s,
+ CHAR_RANGE_S,
+ CHAR_RANGE_w,
+ CHAR_RANGE_W,
+} CharRangeEnum;
+
+static const uint16_t * const char_range_table[] = {
+ char_range_d,
+ char_range_s,
+ char_range_w,
+};
+
+static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
+{
+ BOOL invert;
+ const uint16_t *c_pt;
+ int len, i;
+
+ invert = c & 1;
+ c_pt = char_range_table[c >> 1];
+ len = *c_pt++;
+ cr_init(cr, s->opaque, lre_realloc);
+ for(i = 0; i < len * 2; i++) {
+ if (cr_add_point(cr, c_pt[i]))
+ goto fail;
+ }
+ if (invert) {
+ if (cr_invert(cr))
+ goto fail;
+ }
+ return 0;
+ fail:
+ cr_free(cr);
+ return -1;
+}
+
+#ifdef DUMP_REOP
+static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
+ int buf_len)
+{
+ int pos, len, opcode, bc_len, re_flags, i;
+ uint32_t val;
+
+ assert(buf_len >= RE_HEADER_LEN);
+
+ re_flags = lre_get_flags(buf);
+ bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
+ assert(bc_len + RE_HEADER_LEN <= buf_len);
+ printf("flags: 0x%x capture_count=%d stack_size=%d\n",
+ re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
+ if (re_flags & LRE_FLAG_NAMED_GROUPS) {
+ const char *p;
+ p = (char *)buf + RE_HEADER_LEN + bc_len;
+ printf("named groups: ");
+ for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
+ if (i != 1)
+ printf(",");
+ printf("<%s>", p);
+ p += strlen(p) + 1;
+ }
+ printf("\n");
+ assert(p == (char *)(buf + buf_len));
+ }
+ printf("bytecode_len=%d\n", bc_len);
+
+ buf += RE_HEADER_LEN;
+ pos = 0;
+ while (pos < bc_len) {
+ printf("%5u: ", pos);
+ opcode = buf[pos];
+ len = reopcode_info[opcode].size;
+ if (opcode >= REOP_COUNT) {
+ printf(" invalid opcode=0x%02x\n", opcode);
+ break;
+ }
+ if ((pos + len) > bc_len) {
+ printf(" buffer overflow (opcode=0x%02x)\n", opcode);
+ break;
+ }
+ printf("%s", reopcode_info[opcode].name);
+ switch(opcode) {
+ case REOP_char:
+ val = get_u16(buf + pos + 1);
+ if (val >= ' ' && val <= 126)
+ printf(" '%c'", val);
+ else
+ printf(" 0x%04x", val);
+ break;
+ case REOP_char32:
+ val = get_u32(buf + pos + 1);
+ if (val >= ' ' && val <= 126)
+ printf(" '%c'", val);
+ else
+ printf(" 0x%08x", val);
+ break;
+ case REOP_goto:
+ case REOP_split_goto_first:
+ case REOP_split_next_first:
+ case REOP_loop:
+ case REOP_lookahead:
+ case REOP_negative_lookahead:
+ val = get_u32(buf + pos + 1);
+ val += (pos + 5);
+ printf(" %u", val);
+ break;
+ case REOP_simple_greedy_quant:
+ printf(" %u %u %u %u",
+ get_u32(buf + pos + 1) + (pos + 17),
+ get_u32(buf + pos + 1 + 4),
+ get_u32(buf + pos + 1 + 8),
+ get_u32(buf + pos + 1 + 12));
+ break;
+ case REOP_save_start:
+ case REOP_save_end:
+ case REOP_back_reference:
+ case REOP_backward_back_reference:
+ printf(" %u", buf[pos + 1]);
+ break;
+ case REOP_save_reset:
+ printf(" %u %u", buf[pos + 1], buf[pos + 2]);
+ break;
+ case REOP_push_i32:
+ val = get_u32(buf + pos + 1);
+ printf(" %d", val);
+ break;
+ case REOP_range:
+ {
+ int n, i;
+ n = get_u16(buf + pos + 1);
+ len += n * 4;
+ for(i = 0; i < n * 2; i++) {
+ val = get_u16(buf + pos + 3 + i * 2);
+ printf(" 0x%04x", val);
+ }
+ }
+ break;
+ case REOP_range32:
+ {
+ int n, i;
+ n = get_u16(buf + pos + 1);
+ len += n * 8;
+ for(i = 0; i < n * 2; i++) {
+ val = get_u32(buf + pos + 3 + i * 4);
+ printf(" 0x%08x", val);
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ printf("\n");
+ pos += len;
+ }
+}
+#endif
+
+static void re_emit_op(REParseState *s, int op)
+{
+ dbuf_putc(&s->byte_code, op);
+}
+
+/* return the offset of the u32 value */
+static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
+{
+ int pos;
+ dbuf_putc(&s->byte_code, op);
+ pos = s->byte_code.size;
+ dbuf_put_u32(&s->byte_code, val);
+ return pos;
+}
+
+static int re_emit_goto(REParseState *s, int op, uint32_t val)
+{
+ int pos;
+ dbuf_putc(&s->byte_code, op);
+ pos = s->byte_code.size;
+ dbuf_put_u32(&s->byte_code, val - (pos + 4));
+ return pos;
+}
+
+static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
+{
+ dbuf_putc(&s->byte_code, op);
+ dbuf_putc(&s->byte_code, val);
+}
+
+static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
+{
+ dbuf_putc(&s->byte_code, op);
+ dbuf_put_u16(&s->byte_code, val);
+}
+
+static int FORMAT_ATTR(2, 3) re_parse_error(REParseState *s, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
+ va_end(ap);
+ return -1;
+}
+
+static int re_parse_out_of_memory(REParseState *s)
+{
+ return re_parse_error(s, "out of memory");
+}
+
+/* If allow_overflow is false, return -1 in case of
+ overflow. Otherwise return INT32_MAX. */
+static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
+{
+ const uint8_t *p;
+ uint64_t v;
+ int c;
+
+ p = *pp;
+ v = 0;
+ for(;;) {
+ c = *p;
+ if (c < '0' || c > '9')
+ break;
+ v = v * 10 + c - '0';
+ if (v >= INT32_MAX) {
+ if (allow_overflow)
+ v = INT32_MAX;
+ else
+ return -1;
+ }
+ p++;
+ }
+ *pp = p;
+ return v;
+}
+
+static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
+{
+ const uint8_t *p;
+ p = *pp;
+ if (*p != c)
+ return re_parse_error(s, "expecting '%c'", c);
+ p++;
+ *pp = p;
+ return 0;
+}
+
+/* Parse an escape sequence, *pp points after the '\':
+ allow_utf16 value:
+ 0 : no UTF-16 escapes allowed
+ 1 : UTF-16 escapes allowed
+ 2 : UTF-16 escapes allowed and escapes of surrogate pairs are
+ converted to a unicode character (unicode regexp case).
+
+ Return the unicode char and update *pp if recognized,
+ return -1 if malformed escape,
+ return -2 otherwise. */
+int lre_parse_escape(const uint8_t **pp, int allow_utf16)
+{
+ const uint8_t *p;
+ uint32_t c;
+
+ p = *pp;
+ c = *p++;
+ switch(c) {
+ case 'b':
+ c = '\b';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'v':
+ c = '\v';
+ break;
+ case 'x':
+ case 'u':
+ {
+ int h, n, i;
+ uint32_t c1;
+
+ if (*p == '{' && allow_utf16) {
+ p++;
+ c = 0;
+ for(;;) {
+ h = from_hex(*p++);
+ if (h < 0)
+ return -1;
+ c = (c << 4) | h;
+ if (c > 0x10FFFF)
+ return -1;
+ if (*p == '}')
+ break;
+ }
+ p++;
+ } else {
+ if (c == 'x') {
+ n = 2;
+ } else {
+ n = 4;
+ }
+
+ c = 0;
+ for(i = 0; i < n; i++) {
+ h = from_hex(*p++);
+ if (h < 0) {
+ return -1;
+ }
+ c = (c << 4) | h;
+ }
+ if (is_hi_surrogate(c) &&
+ allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
+ /* convert an escaped surrogate pair into a
+ unicode char */
+ c1 = 0;
+ for(i = 0; i < 4; i++) {
+ h = from_hex(p[2 + i]);
+ if (h < 0)
+ break;
+ c1 = (c1 << 4) | h;
+ }
+ if (i == 4 && is_lo_surrogate(c1)) {
+ p += 6;
+ c = from_surrogate(c, c1);
+ }
+ }
+ }
+ }
+ break;
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ c -= '0';
+ if (allow_utf16 == 2) {
+ /* only accept \0 not followed by digit */
+ if (c != 0 || is_digit(*p))
+ return -1;
+ } else {
+ /* parse a legacy octal sequence */
+ uint32_t v;
+ v = *p - '0';
+ if (v > 7)
+ break;
+ c = (c << 3) | v;
+ p++;
+ if (c >= 32)
+ break;
+ v = *p - '0';
+ if (v > 7)
+ break;
+ c = (c << 3) | v;
+ p++;
+ }
+ break;
+ default:
+ return -2;
+ }
+ *pp = p;
+ return c;
+}
+
+#ifdef CONFIG_ALL_UNICODE
+/* XXX: we use the same chars for name and value */
+static BOOL is_unicode_char(int c)
+{
+ return ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z') ||
+ (c == '_'));
+}
+
+static int parse_unicode_property(REParseState *s, CharRange *cr,
+ const uint8_t **pp, BOOL is_inv)
+{
+ const uint8_t *p;
+ char name[64], value[64];
+ char *q;
+ BOOL script_ext;
+ int ret;
+
+ p = *pp;
+ if (*p != '{')
+ return re_parse_error(s, "expecting '{' after \\p");
+ p++;
+ q = name;
+ while (is_unicode_char(*p)) {
+ if ((q - name) >= sizeof(name) - 1)
+ goto unknown_property_name;
+ *q++ = *p++;
+ }
+ *q = '\0';
+ q = value;
+ if (*p == '=') {
+ p++;
+ while (is_unicode_char(*p)) {
+ if ((q - value) >= sizeof(value) - 1)
+ return re_parse_error(s, "unknown unicode property value");
+ *q++ = *p++;
+ }
+ }
+ *q = '\0';
+ if (*p != '}')
+ return re_parse_error(s, "expecting '}'");
+ p++;
+ // printf("name=%s value=%s\n", name, value);
+
+ if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
+ script_ext = FALSE;
+ goto do_script;
+ } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
+ script_ext = TRUE;
+ do_script:
+ cr_init(cr, s->opaque, lre_realloc);
+ ret = unicode_script(cr, value, script_ext);
+ if (ret) {
+ cr_free(cr);
+ if (ret == -2)
+ return re_parse_error(s, "unknown unicode script");
+ else
+ goto out_of_memory;
+ }
+ } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
+ cr_init(cr, s->opaque, lre_realloc);
+ ret = unicode_general_category(cr, value);
+ if (ret) {
+ cr_free(cr);
+ if (ret == -2)
+ return re_parse_error(s, "unknown unicode general category");
+ else
+ goto out_of_memory;
+ }
+ } else if (value[0] == '\0') {
+ cr_init(cr, s->opaque, lre_realloc);
+ ret = unicode_general_category(cr, name);
+ if (ret == -1) {
+ cr_free(cr);
+ goto out_of_memory;
+ }
+ if (ret < 0) {
+ ret = unicode_prop(cr, name);
+ if (ret) {
+ cr_free(cr);
+ if (ret == -2)
+ goto unknown_property_name;
+ else
+ goto out_of_memory;
+ }
+ }
+ } else {
+ unknown_property_name:
+ return re_parse_error(s, "unknown unicode property name");
+ }
+
+ if (is_inv) {
+ if (cr_invert(cr)) {
+ cr_free(cr);
+ return -1;
+ }
+ }
+ *pp = p;
+ return 0;
+ out_of_memory:
+ return re_parse_out_of_memory(s);
+}
+#endif /* CONFIG_ALL_UNICODE */
+
+/* return -1 if error otherwise the character or a class range
+ (CLASS_RANGE_BASE). In case of class range, 'cr' is
+ initialized. Otherwise, it is ignored. */
+static int get_class_atom(REParseState *s, CharRange *cr,
+ const uint8_t **pp, BOOL inclass)
+{
+ const uint8_t *p;
+ uint32_t c;
+ int ret;
+
+ p = *pp;
+
+ c = *p;
+ switch(c) {
+ case '\\':
+ p++;
+ if (p >= s->buf_end)
+ goto unexpected_end;
+ c = *p++;
+ switch(c) {
+ case 'd':
+ c = CHAR_RANGE_d;
+ goto class_range;
+ case 'D':
+ c = CHAR_RANGE_D;
+ goto class_range;
+ case 's':
+ c = CHAR_RANGE_s;
+ goto class_range;
+ case 'S':
+ c = CHAR_RANGE_S;
+ goto class_range;
+ case 'w':
+ c = CHAR_RANGE_w;
+ goto class_range;
+ case 'W':
+ c = CHAR_RANGE_W;
+ class_range:
+ if (cr_init_char_range(s, cr, c))
+ return -1;
+ c = CLASS_RANGE_BASE;
+ break;
+ case 'c':
+ c = *p;
+ if ((c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (((c >= '0' && c <= '9') || c == '_') &&
+ inclass && !s->is_unicode)) { /* Annex B.1.4 */
+ c &= 0x1f;
+ p++;
+ } else if (s->is_unicode) {
+ goto invalid_escape;
+ } else {
+ /* otherwise return '\' and 'c' */
+ p--;
+ c = '\\';
+ }
+ break;
+#ifdef CONFIG_ALL_UNICODE
+ case 'p':
+ case 'P':
+ if (s->is_unicode) {
+ if (parse_unicode_property(s, cr, &p, (c == 'P')))
+ return -1;
+ c = CLASS_RANGE_BASE;
+ break;
+ }
+ /* fall thru */
+#endif
+ default:
+ p--;
+ ret = lre_parse_escape(&p, s->is_unicode * 2);
+ if (ret >= 0) {
+ c = ret;
+ } else {
+ if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
+ /* always valid to escape these characters */
+ goto normal_char;
+ } else if (s->is_unicode) {
+ invalid_escape:
+ return re_parse_error(s, "invalid escape sequence in regular expression");
+ } else {
+ /* just ignore the '\' */
+ goto normal_char;
+ }
+ }
+ break;
+ }
+ break;
+ case '\0':
+ if (p >= s->buf_end) {
+ unexpected_end:
+ return re_parse_error(s, "unexpected end");
+ }
+ /* fall thru */
+ default:
+ normal_char:
+ /* normal char */
+ if (c >= 128) {
+ c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+ if ((unsigned)c > 0xffff && !s->is_unicode) {
+ /* XXX: should handle non BMP-1 code points */
+ return re_parse_error(s, "malformed unicode char");
+ }
+ } else {
+ p++;
+ }
+ break;
+ }
+ *pp = p;
+ return c;
+}
+
+static int re_emit_range(REParseState *s, const CharRange *cr)
+{
+ int len, i;
+ uint32_t high;
+
+ len = (unsigned)cr->len / 2;
+ if (len >= 65535)
+ return re_parse_error(s, "too many ranges");
+ if (len == 0) {
+ /* not sure it can really happen. Emit a match that is always
+ false */
+ re_emit_op_u32(s, REOP_char32, -1);
+ } else {
+ high = cr->points[cr->len - 1];
+ if (high == UINT32_MAX)
+ high = cr->points[cr->len - 2];
+ if (high <= 0xffff) {
+ /* can use 16 bit ranges with the conversion that 0xffff =
+ infinity */
+ re_emit_op_u16(s, REOP_range, len);
+ for(i = 0; i < cr->len; i += 2) {
+ dbuf_put_u16(&s->byte_code, cr->points[i]);
+ high = cr->points[i + 1] - 1;
+ if (high == UINT32_MAX - 1)
+ high = 0xffff;
+ dbuf_put_u16(&s->byte_code, high);
+ }
+ } else {
+ re_emit_op_u16(s, REOP_range32, len);
+ for(i = 0; i < cr->len; i += 2) {
+ dbuf_put_u32(&s->byte_code, cr->points[i]);
+ dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
+ }
+ }
+ }
+ return 0;
+}
+
+static int re_parse_char_class(REParseState *s, const uint8_t **pp)
+{
+ const uint8_t *p;
+ uint32_t c1, c2;
+ CharRange cr_s, *cr = &cr_s;
+ CharRange cr1_s, *cr1 = &cr1_s;
+ BOOL invert;
+
+ cr_init(cr, s->opaque, lre_realloc);
+ p = *pp;
+ p++; /* skip '[' */
+
+ invert = FALSE;
+ if (*p == '^') {
+ p++;
+ invert = TRUE;
+ }
+
+ for(;;) {
+ if (*p == ']')
+ break;
+ c1 = get_class_atom(s, cr1, &p, TRUE);
+ if ((int)c1 < 0)
+ goto fail;
+ if (*p == '-' && p[1] != ']') {
+ const uint8_t *p0 = p + 1;
+ if (c1 >= CLASS_RANGE_BASE) {
+ if (s->is_unicode) {
+ cr_free(cr1);
+ goto invalid_class_range;
+ }
+ /* Annex B: match '-' character */
+ goto class_atom;
+ }
+ c2 = get_class_atom(s, cr1, &p0, TRUE);
+ if ((int)c2 < 0)
+ goto fail;
+ if (c2 >= CLASS_RANGE_BASE) {
+ cr_free(cr1);
+ if (s->is_unicode) {
+ goto invalid_class_range;
+ }
+ /* Annex B: match '-' character */
+ goto class_atom;
+ }
+ p = p0;
+ if (c2 < c1) {
+ invalid_class_range:
+ re_parse_error(s, "invalid class range");
+ goto fail;
+ }
+ if (cr_union_interval(cr, c1, c2))
+ goto memory_error;
+ } else {
+ class_atom:
+ if (c1 >= CLASS_RANGE_BASE) {
+ int ret;
+ ret = cr_union1(cr, cr1->points, cr1->len);
+ cr_free(cr1);
+ if (ret)
+ goto memory_error;
+ } else {
+ if (cr_union_interval(cr, c1, c1))
+ goto memory_error;
+ }
+ }
+ }
+ if (s->ignore_case) {
+ if (cr_regexp_canonicalize(cr, s->is_unicode))
+ goto memory_error;
+ }
+ if (invert) {
+ if (cr_invert(cr))
+ goto memory_error;
+ }
+ if (re_emit_range(s, cr))
+ goto fail;
+ cr_free(cr);
+ p++; /* skip ']' */
+ *pp = p;
+ return 0;
+ memory_error:
+ re_parse_out_of_memory(s);
+ fail:
+ cr_free(cr);
+ return -1;
+}
+
+/* Return:
+ - true if the opcodes may not advance the char pointer
+ - false if the opcodes always advance the char pointer
+*/
+static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
+{
+ int pos, opcode, len;
+ uint32_t val;
+ BOOL ret;
+
+ ret = TRUE;
+ pos = 0;
+ while (pos < bc_buf_len) {
+ opcode = bc_buf[pos];
+ len = reopcode_info[opcode].size;
+ switch(opcode) {
+ case REOP_range:
+ val = get_u16(bc_buf + pos + 1);
+ len += val * 4;
+ goto simple_char;
+ case REOP_range32:
+ val = get_u16(bc_buf + pos + 1);
+ len += val * 8;
+ goto simple_char;
+ case REOP_char:
+ case REOP_char32:
+ case REOP_dot:
+ case REOP_any:
+ simple_char:
+ ret = FALSE;
+ break;
+ case REOP_line_start:
+ case REOP_line_end:
+ case REOP_push_i32:
+ case REOP_push_char_pos:
+ case REOP_drop:
+ case REOP_word_boundary:
+ case REOP_not_word_boundary:
+ case REOP_prev:
+ /* no effect */
+ break;
+ case REOP_save_start:
+ case REOP_save_end:
+ case REOP_save_reset:
+ case REOP_back_reference:
+ case REOP_backward_back_reference:
+ break;
+ default:
+ /* safe behavior: we cannot predict the outcome */
+ return TRUE;
+ }
+ pos += len;
+ }
+ return ret;
+}
+
+/* return -1 if a simple quantifier cannot be used. Otherwise return
+ the number of characters in the atom. */
+static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
+{
+ int pos, opcode, len, count;
+ uint32_t val;
+
+ count = 0;
+ pos = 0;
+ while (pos < bc_buf_len) {
+ opcode = bc_buf[pos];
+ len = reopcode_info[opcode].size;
+ switch(opcode) {
+ case REOP_range:
+ val = get_u16(bc_buf + pos + 1);
+ len += val * 4;
+ goto simple_char;
+ case REOP_range32:
+ val = get_u16(bc_buf + pos + 1);
+ len += val * 8;
+ goto simple_char;
+ case REOP_char:
+ case REOP_char32:
+ case REOP_dot:
+ case REOP_any:
+ simple_char:
+ count++;
+ break;
+ case REOP_line_start:
+ case REOP_line_end:
+ case REOP_word_boundary:
+ case REOP_not_word_boundary:
+ break;
+ default:
+ return -1;
+ }
+ pos += len;
+ }
+ return count;
+}
+
+/* '*pp' is the first char after '<' */
+static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
+{
+ const uint8_t *p, *p1;
+ uint32_t c, d;
+ char *q;
+
+ p = *pp;
+ q = buf;
+ for(;;) {
+ c = *p;
+ if (c == '\\') {
+ p++;
+ if (*p != 'u')
+ return -1;
+ c = lre_parse_escape(&p, 2); // accept surrogate pairs
+ } else if (c == '>') {
+ break;
+ } else if (c >= 128) {
+ c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+ if (is_hi_surrogate(c)) {
+ d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+ if (is_lo_surrogate(d)) {
+ c = from_surrogate(c, d);
+ p = p1;
+ }
+ }
+ } else {
+ p++;
+ }
+ if (c > 0x10FFFF)
+ return -1;
+ if (q == buf) {
+ if (!lre_js_is_ident_first(c))
+ return -1;
+ } else {
+ if (!lre_js_is_ident_next(c))
+ return -1;
+ }
+ if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
+ return -1;
+ if (c < 128) {
+ *q++ = c;
+ } else {
+ q += unicode_to_utf8((uint8_t*)q, c);
+ }
+ }
+ if (q == buf)
+ return -1;
+ *q = '\0';
+ p++;
+ *pp = p;
+ return 0;
+}
+
+/* if capture_name = NULL: return the number of captures + 1.
+ Otherwise, return the capture index corresponding to capture_name
+ or -1 if none */
+static int re_parse_captures(REParseState *s, int *phas_named_captures,
+ const char *capture_name)
+{
+ const uint8_t *p;
+ int capture_index;
+ char name[TMP_BUF_SIZE];
+
+ capture_index = 1;
+ *phas_named_captures = 0;
+ for (p = s->buf_start; p < s->buf_end; p++) {
+ switch (*p) {
+ case '(':
+ if (p[1] == '?') {
+ if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
+ *phas_named_captures = 1;
+ /* potential named capture */
+ if (capture_name) {
+ p += 3;
+ if (re_parse_group_name(name, sizeof(name), &p) == 0) {
+ if (!strcmp(name, capture_name))
+ return capture_index;
+ }
+ }
+ capture_index++;
+ if (capture_index >= CAPTURE_COUNT_MAX)
+ goto done;
+ }
+ } else {
+ capture_index++;
+ if (capture_index >= CAPTURE_COUNT_MAX)
+ goto done;
+ }
+ break;
+ case '\\':
+ p++;
+ break;
+ case '[':
+ for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
+ if (*p == '\\')
+ p++;
+ }
+ break;
+ }
+ }
+ done:
+ if (capture_name)
+ return -1;
+ else
+ return capture_index;
+}
+
+static int re_count_captures(REParseState *s)
+{
+ if (s->total_capture_count < 0) {
+ s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
+ NULL);
+ }
+ return s->total_capture_count;
+}
+
+static BOOL re_has_named_captures(REParseState *s)
+{
+ if (s->has_named_captures < 0)
+ re_count_captures(s);
+ return s->has_named_captures;
+}
+
+static int find_group_name(REParseState *s, const char *name)
+{
+ const char *p, *buf_end;
+ size_t len, name_len;
+ int capture_index;
+
+ p = (char *)s->group_names.buf;
+ if (!p) return -1;
+ buf_end = (char *)s->group_names.buf + s->group_names.size;
+ name_len = strlen(name);
+ capture_index = 1;
+ while (p < buf_end) {
+ len = strlen(p);
+ if (len == name_len && memcmp(name, p, name_len) == 0)
+ return capture_index;
+ p += len + 1;
+ capture_index++;
+ }
+ return -1;
+}
+
+static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
+
+static int re_parse_term(REParseState *s, BOOL is_backward_dir)
+{
+ const uint8_t *p;
+ int c, last_atom_start, quant_min, quant_max, last_capture_count;
+ BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
+ CharRange cr_s, *cr = &cr_s;
+
+ last_atom_start = -1;
+ last_capture_count = 0;
+ p = s->buf_ptr;
+ c = *p;
+ switch(c) {
+ case '^':
+ p++;
+ re_emit_op(s, REOP_line_start);
+ break;
+ case '$':
+ p++;
+ re_emit_op(s, REOP_line_end);
+ break;
+ case '.':
+ p++;
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ if (is_backward_dir)
+ re_emit_op(s, REOP_prev);
+ re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
+ if (is_backward_dir)
+ re_emit_op(s, REOP_prev);
+ break;
+ case '{':
+ if (s->is_unicode) {
+ return re_parse_error(s, "syntax error");
+ } else if (!is_digit(p[1])) {
+ /* Annex B: we accept '{' not followed by digits as a
+ normal atom */
+ goto parse_class_atom;
+ } else {
+ const uint8_t *p1 = p + 1;
+ /* Annex B: error if it is like a repetition count */
+ parse_digits(&p1, TRUE);
+ if (*p1 == ',') {
+ p1++;
+ if (is_digit(*p1)) {
+ parse_digits(&p1, TRUE);
+ }
+ }
+ if (*p1 != '}') {
+ goto parse_class_atom;
+ }
+ }
+ /* fall thru */
+ case '*':
+ case '+':
+ case '?':
+ return re_parse_error(s, "nothing to repeat");
+ case '(':
+ if (p[1] == '?') {
+ if (p[2] == ':') {
+ p += 3;
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ s->buf_ptr = p;
+ if (re_parse_disjunction(s, is_backward_dir))
+ return -1;
+ p = s->buf_ptr;
+ if (re_parse_expect(s, &p, ')'))
+ return -1;
+ } else if ((p[2] == '=' || p[2] == '!')) {
+ is_neg = (p[2] == '!');
+ is_backward_lookahead = FALSE;
+ p += 3;
+ goto lookahead;
+ } else if (p[2] == '<' &&
+ (p[3] == '=' || p[3] == '!')) {
+ int pos;
+ is_neg = (p[3] == '!');
+ is_backward_lookahead = TRUE;
+ p += 4;
+ /* lookahead */
+ lookahead:
+ /* Annex B allows lookahead to be used as an atom for
+ the quantifiers */
+ if (!s->is_unicode && !is_backward_lookahead) {
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ }
+ pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
+ s->buf_ptr = p;
+ if (re_parse_disjunction(s, is_backward_lookahead))
+ return -1;
+ p = s->buf_ptr;
+ if (re_parse_expect(s, &p, ')'))
+ return -1;
+ re_emit_op(s, REOP_match);
+ /* jump after the 'match' after the lookahead is successful */
+ if (dbuf_error(&s->byte_code))
+ return -1;
+ put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
+ } else if (p[2] == '<') {
+ p += 3;
+ if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
+ &p)) {
+ return re_parse_error(s, "invalid group name");
+ }
+ if (find_group_name(s, s->u.tmp_buf) > 0) {
+ return re_parse_error(s, "duplicate group name");
+ }
+ /* group name with a trailing zero */
+ dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
+ strlen(s->u.tmp_buf) + 1);
+ s->has_named_captures = 1;
+ goto parse_capture;
+ } else {
+ return re_parse_error(s, "invalid group");
+ }
+ } else {
+ int capture_index;
+ p++;
+ /* capture without group name */
+ dbuf_putc(&s->group_names, 0);
+ parse_capture:
+ if (s->capture_count >= CAPTURE_COUNT_MAX)
+ return re_parse_error(s, "too many captures");
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ capture_index = s->capture_count++;
+ re_emit_op_u8(s, REOP_save_start + is_backward_dir,
+ capture_index);
+
+ s->buf_ptr = p;
+ if (re_parse_disjunction(s, is_backward_dir))
+ return -1;
+ p = s->buf_ptr;
+
+ re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
+ capture_index);
+
+ if (re_parse_expect(s, &p, ')'))
+ return -1;
+ }
+ break;
+ case '\\':
+ switch(p[1]) {
+ case 'b':
+ case 'B':
+ re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
+ p += 2;
+ break;
+ case 'k':
+ {
+ const uint8_t *p1;
+ int dummy_res;
+
+ p1 = p;
+ if (p1[2] != '<') {
+ /* annex B: we tolerate invalid group names in non
+ unicode mode if there is no named capture
+ definition */
+ if (s->is_unicode || re_has_named_captures(s))
+ return re_parse_error(s, "expecting group name");
+ else
+ goto parse_class_atom;
+ }
+ p1 += 3;
+ if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
+ &p1)) {
+ if (s->is_unicode || re_has_named_captures(s))
+ return re_parse_error(s, "invalid group name");
+ else
+ goto parse_class_atom;
+ }
+ c = find_group_name(s, s->u.tmp_buf);
+ if (c < 0) {
+ /* no capture name parsed before, try to look
+ after (inefficient, but hopefully not common */
+ c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
+ if (c < 0) {
+ if (s->is_unicode || re_has_named_captures(s))
+ return re_parse_error(s, "group name not defined");
+ else
+ goto parse_class_atom;
+ }
+ }
+ p = p1;
+ }
+ goto emit_back_reference;
+ case '0':
+ p += 2;
+ c = 0;
+ if (s->is_unicode) {
+ if (is_digit(*p)) {
+ return re_parse_error(s, "invalid decimal escape in regular expression");
+ }
+ } else {
+ /* Annex B.1.4: accept legacy octal */
+ if (*p >= '0' && *p <= '7') {
+ c = *p++ - '0';
+ if (*p >= '0' && *p <= '7') {
+ c = (c << 3) + *p++ - '0';
+ }
+ }
+ }
+ goto normal_char;
+ case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8':
+ case '9':
+ {
+ const uint8_t *q = ++p;
+
+ c = parse_digits(&p, FALSE);
+ if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
+ if (!s->is_unicode) {
+ /* Annex B.1.4: accept legacy octal */
+ p = q;
+ if (*p <= '7') {
+ c = 0;
+ if (*p <= '3')
+ c = *p++ - '0';
+ if (*p >= '0' && *p <= '7') {
+ c = (c << 3) + *p++ - '0';
+ if (*p >= '0' && *p <= '7') {
+ c = (c << 3) + *p++ - '0';
+ }
+ }
+ } else {
+ c = *p++;
+ }
+ goto normal_char;
+ }
+ return re_parse_error(s, "back reference out of range in regular expression");
+ }
+ emit_back_reference:
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
+ }
+ break;
+ default:
+ goto parse_class_atom;
+ }
+ break;
+ case '[':
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ if (is_backward_dir)
+ re_emit_op(s, REOP_prev);
+ if (re_parse_char_class(s, &p))
+ return -1;
+ if (is_backward_dir)
+ re_emit_op(s, REOP_prev);
+ break;
+ case ']':
+ case '}':
+ if (s->is_unicode)
+ return re_parse_error(s, "syntax error");
+ goto parse_class_atom;
+ default:
+ parse_class_atom:
+ c = get_class_atom(s, cr, &p, FALSE);
+ if ((int)c < 0)
+ return -1;
+ normal_char:
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ if (is_backward_dir)
+ re_emit_op(s, REOP_prev);
+ if (c >= CLASS_RANGE_BASE) {
+ int ret;
+ /* Note: canonicalization is not needed */
+ ret = re_emit_range(s, cr);
+ cr_free(cr);
+ if (ret)
+ return -1;
+ } else {
+ if (s->ignore_case)
+ c = lre_canonicalize(c, s->is_unicode);
+ if (c <= 0xffff)
+ re_emit_op_u16(s, REOP_char, c);
+ else
+ re_emit_op_u32(s, REOP_char32, c);
+ }
+ if (is_backward_dir)
+ re_emit_op(s, REOP_prev);
+ break;
+ }
+
+ /* quantifier */
+ if (last_atom_start >= 0) {
+ c = *p;
+ switch(c) {
+ case '*':
+ p++;
+ quant_min = 0;
+ quant_max = INT32_MAX;
+ goto quantifier;
+ case '+':
+ p++;
+ quant_min = 1;
+ quant_max = INT32_MAX;
+ goto quantifier;
+ case '?':
+ p++;
+ quant_min = 0;
+ quant_max = 1;
+ goto quantifier;
+ case '{':
+ {
+ const uint8_t *p1 = p;
+ /* As an extension (see ES6 annex B), we accept '{' not
+ followed by digits as a normal atom */
+ if (!is_digit(p[1])) {
+ if (s->is_unicode)
+ goto invalid_quant_count;
+ break;
+ }
+ p++;
+ quant_min = parse_digits(&p, TRUE);
+ quant_max = quant_min;
+ if (*p == ',') {
+ p++;
+ if (is_digit(*p)) {
+ quant_max = parse_digits(&p, TRUE);
+ if (quant_max < quant_min) {
+ invalid_quant_count:
+ return re_parse_error(s, "invalid repetition count");
+ }
+ } else {
+ quant_max = INT32_MAX; /* infinity */
+ }
+ }
+ if (*p != '}' && !s->is_unicode) {
+ /* Annex B: normal atom if invalid '{' syntax */
+ p = p1;
+ break;
+ }
+ if (re_parse_expect(s, &p, '}'))
+ return -1;
+ }
+ quantifier:
+ greedy = TRUE;
+ if (*p == '?') {
+ p++;
+ greedy = FALSE;
+ }
+ if (last_atom_start < 0) {
+ return re_parse_error(s, "nothing to repeat");
+ }
+ if (greedy) {
+ int len, pos;
+
+ if (quant_max > 0) {
+ /* specific optimization for simple quantifiers */
+ if (dbuf_error(&s->byte_code))
+ goto out_of_memory;
+ len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
+ s->byte_code.size - last_atom_start);
+ if (len > 0) {
+ re_emit_op(s, REOP_match);
+
+ if (dbuf_insert(&s->byte_code, last_atom_start, 17))
+ goto out_of_memory;
+ pos = last_atom_start;
+ s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
+ put_u32(&s->byte_code.buf[pos],
+ s->byte_code.size - last_atom_start - 17);
+ pos += 4;
+ put_u32(&s->byte_code.buf[pos], quant_min);
+ pos += 4;
+ put_u32(&s->byte_code.buf[pos], quant_max);
+ pos += 4;
+ put_u32(&s->byte_code.buf[pos], len);
+ pos += 4;
+ goto done;
+ }
+ }
+
+ if (dbuf_error(&s->byte_code))
+ goto out_of_memory;
+ /* the spec tells that if there is no advance when
+ running the atom after the first quant_min times,
+ then there is no match. We remove this test when we
+ are sure the atom always advances the position. */
+ add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
+ s->byte_code.size - last_atom_start);
+ } else {
+ add_zero_advance_check = FALSE;
+ }
+
+ {
+ int len, pos;
+ len = s->byte_code.size - last_atom_start;
+ if (quant_min == 0) {
+ /* need to reset the capture in case the atom is
+ not executed */
+ if (last_capture_count != s->capture_count) {
+ if (dbuf_insert(&s->byte_code, last_atom_start, 3))
+ goto out_of_memory;
+ s->byte_code.buf[last_atom_start++] = REOP_save_reset;
+ s->byte_code.buf[last_atom_start++] = last_capture_count;
+ s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
+ }
+ if (quant_max == 0) {
+ s->byte_code.size = last_atom_start;
+ } else if (quant_max == 1 || quant_max == INT32_MAX) {
+ BOOL has_goto = (quant_max == INT32_MAX);
+ if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
+ goto out_of_memory;
+ s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
+ greedy;
+ put_u32(s->byte_code.buf + last_atom_start + 1,
+ len + 5 * has_goto + add_zero_advance_check * 2);
+ if (add_zero_advance_check) {
+ s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
+ re_emit_op(s, REOP_check_advance);
+ }
+ if (has_goto)
+ re_emit_goto(s, REOP_goto, last_atom_start);
+ } else {
+ if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
+ goto out_of_memory;
+ pos = last_atom_start;
+ s->byte_code.buf[pos++] = REOP_push_i32;
+ put_u32(s->byte_code.buf + pos, quant_max);
+ pos += 4;
+ s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
+ put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
+ pos += 4;
+ if (add_zero_advance_check) {
+ s->byte_code.buf[pos++] = REOP_push_char_pos;
+ re_emit_op(s, REOP_check_advance);
+ }
+ re_emit_goto(s, REOP_loop, last_atom_start + 5);
+ re_emit_op(s, REOP_drop);
+ }
+ } else if (quant_min == 1 && quant_max == INT32_MAX &&
+ !add_zero_advance_check) {
+ re_emit_goto(s, REOP_split_next_first - greedy,
+ last_atom_start);
+ } else {
+ if (quant_min == 1) {
+ /* nothing to add */
+ } else {
+ if (dbuf_insert(&s->byte_code, last_atom_start, 5))
+ goto out_of_memory;
+ s->byte_code.buf[last_atom_start] = REOP_push_i32;
+ put_u32(s->byte_code.buf + last_atom_start + 1,
+ quant_min);
+ last_atom_start += 5;
+ re_emit_goto(s, REOP_loop, last_atom_start);
+ re_emit_op(s, REOP_drop);
+ }
+ if (quant_max == INT32_MAX) {
+ pos = s->byte_code.size;
+ re_emit_op_u32(s, REOP_split_goto_first + greedy,
+ len + 5 + add_zero_advance_check * 2);
+ if (add_zero_advance_check)
+ re_emit_op(s, REOP_push_char_pos);
+ /* copy the atom */
+ dbuf_put_self(&s->byte_code, last_atom_start, len);
+ if (add_zero_advance_check)
+ re_emit_op(s, REOP_check_advance);
+ re_emit_goto(s, REOP_goto, pos);
+ } else if (quant_max > quant_min) {
+ re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
+ pos = s->byte_code.size;
+ re_emit_op_u32(s, REOP_split_goto_first + greedy,
+ len + 5 + add_zero_advance_check * 2);
+ if (add_zero_advance_check)
+ re_emit_op(s, REOP_push_char_pos);
+ /* copy the atom */
+ dbuf_put_self(&s->byte_code, last_atom_start, len);
+ if (add_zero_advance_check)
+ re_emit_op(s, REOP_check_advance);
+ re_emit_goto(s, REOP_loop, pos);
+ re_emit_op(s, REOP_drop);
+ }
+ }
+ last_atom_start = -1;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ done:
+ s->buf_ptr = p;
+ return 0;
+ out_of_memory:
+ return re_parse_out_of_memory(s);
+}
+
+static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
+{
+ const uint8_t *p;
+ int ret;
+ size_t start, term_start, end, term_size;
+
+ start = s->byte_code.size;
+ for(;;) {
+ p = s->buf_ptr;
+ if (p >= s->buf_end)
+ break;
+ if (*p == '|' || *p == ')')
+ break;
+ term_start = s->byte_code.size;
+ ret = re_parse_term(s, is_backward_dir);
+ if (ret)
+ return ret;
+ if (is_backward_dir) {
+ /* reverse the order of the terms (XXX: inefficient, but
+ speed is not really critical here) */
+ end = s->byte_code.size;
+ term_size = end - term_start;
+ if (dbuf_realloc(&s->byte_code, end + term_size))
+ return -1;
+ memmove(s->byte_code.buf + start + term_size,
+ s->byte_code.buf + start,
+ end - start);
+ memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
+ term_size);
+ }
+ }
+ return 0;
+}
+
+static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
+{
+ int start, len, pos;
+
+ if (lre_check_stack_overflow(s->opaque, 0))
+ return re_parse_error(s, "stack overflow");
+
+ start = s->byte_code.size;
+ if (re_parse_alternative(s, is_backward_dir))
+ return -1;
+ while (*s->buf_ptr == '|') {
+ s->buf_ptr++;
+
+ len = s->byte_code.size - start;
+
+ /* insert a split before the first alternative */
+ if (dbuf_insert(&s->byte_code, start, 5)) {
+ return re_parse_out_of_memory(s);
+ }
+ s->byte_code.buf[start] = REOP_split_next_first;
+ put_u32(s->byte_code.buf + start + 1, len + 5);
+
+ pos = re_emit_op_u32(s, REOP_goto, 0);
+
+ if (re_parse_alternative(s, is_backward_dir))
+ return -1;
+
+ /* patch the goto */
+ len = s->byte_code.size - (pos + 4);
+ put_u32(s->byte_code.buf + pos, len);
+ }
+ return 0;
+}
+
+/* the control flow is recursive so the analysis can be linear */
+static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
+{
+ int stack_size, stack_size_max, pos, opcode, len;
+ uint32_t val;
+
+ stack_size = 0;
+ stack_size_max = 0;
+ bc_buf += RE_HEADER_LEN;
+ bc_buf_len -= RE_HEADER_LEN;
+ pos = 0;
+ while (pos < bc_buf_len) {
+ opcode = bc_buf[pos];
+ len = reopcode_info[opcode].size;
+ assert(opcode < REOP_COUNT);
+ assert((pos + len) <= bc_buf_len);
+ switch(opcode) {
+ case REOP_push_i32:
+ case REOP_push_char_pos:
+ stack_size++;
+ if (stack_size > stack_size_max) {
+ if (stack_size > STACK_SIZE_MAX)
+ return -1;
+ stack_size_max = stack_size;
+ }
+ break;
+ case REOP_drop:
+ case REOP_check_advance:
+ assert(stack_size > 0);
+ stack_size--;
+ break;
+ case REOP_range:
+ val = get_u16(bc_buf + pos + 1);
+ len += val * 4;
+ break;
+ case REOP_range32:
+ val = get_u16(bc_buf + pos + 1);
+ len += val * 8;
+ break;
+ }
+ pos += len;
+ }
+ return stack_size_max;
+}
+
+/* 'buf' must be a zero terminated UTF-8 string of length buf_len.
+ Return NULL if error and allocate an error message in *perror_msg,
+ otherwise the compiled bytecode and its length in plen.
+*/
+uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
+ const char *buf, size_t buf_len, int re_flags,
+ void *opaque)
+{
+ REParseState s_s, *s = &s_s;
+ int stack_size;
+ BOOL is_sticky;
+
+ memset(s, 0, sizeof(*s));
+ s->opaque = opaque;
+ s->buf_ptr = (const uint8_t *)buf;
+ s->buf_end = s->buf_ptr + buf_len;
+ s->buf_start = s->buf_ptr;
+ s->re_flags = re_flags;
+ s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
+ is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
+ s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
+ s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
+ s->capture_count = 1;
+ s->total_capture_count = -1;
+ s->has_named_captures = -1;
+
+ dbuf_init2(&s->byte_code, opaque, lre_realloc);
+ dbuf_init2(&s->group_names, opaque, lre_realloc);
+
+ dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
+ dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
+ dbuf_putc(&s->byte_code, 0); /* stack size */
+ dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
+
+ if (!is_sticky) {
+ /* iterate thru all positions (about the same as .*?( ... ) )
+ . We do it without an explicit loop so that lock step
+ thread execution will be possible in an optimized
+ implementation */
+ re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
+ re_emit_op(s, REOP_any);
+ re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
+ }
+ re_emit_op_u8(s, REOP_save_start, 0);
+
+ if (re_parse_disjunction(s, FALSE)) {
+ error:
+ dbuf_free(&s->byte_code);
+ dbuf_free(&s->group_names);
+ pstrcpy(error_msg, error_msg_size, s->u.error_msg);
+ *plen = 0;
+ return NULL;
+ }
+
+ re_emit_op_u8(s, REOP_save_end, 0);
+
+ re_emit_op(s, REOP_match);
+
+ if (*s->buf_ptr != '\0') {
+ re_parse_error(s, "extraneous characters at the end");
+ goto error;
+ }
+
+ if (dbuf_error(&s->byte_code)) {
+ re_parse_out_of_memory(s);
+ goto error;
+ }
+
+ stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
+ if (stack_size < 0) {
+ re_parse_error(s, "too many imbricated quantifiers");
+ goto error;
+ }
+
+ s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
+ s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
+ put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
+ s->byte_code.size - RE_HEADER_LEN);
+
+ /* add the named groups if needed */
+ if (s->group_names.size > (s->capture_count - 1)) {
+ dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
+ s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
+ }
+ dbuf_free(&s->group_names);
+
+#ifdef DUMP_REOP
+ lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
+#endif
+
+ error_msg[0] = '\0';
+ *plen = s->byte_code.size;
+ return s->byte_code.buf;
+}
+
+static BOOL is_line_terminator(uint32_t c)
+{
+ return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
+}
+
+static BOOL is_word_char(uint32_t c)
+{
+ return ((c >= '0' && c <= '9') ||
+ (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c == '_'));
+}
+
+#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
+ do { \
+ if (cbuf_type == 0) { \
+ c = *cptr++; \
+ } else { \
+ const uint16_t *_p = (const uint16_t *)cptr; \
+ const uint16_t *_end = (const uint16_t *)cbuf_end; \
+ c = *_p++; \
+ if (is_hi_surrogate(c) && cbuf_type == 2) { \
+ if (_p < _end && is_lo_surrogate(*_p)) { \
+ c = from_surrogate(c, *_p++); \
+ } \
+ } \
+ cptr = (const void *)_p; \
+ } \
+ } while (0)
+
+#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \
+ do { \
+ if (cbuf_type == 0) { \
+ c = cptr[0]; \
+ } else { \
+ const uint16_t *_p = (const uint16_t *)cptr; \
+ const uint16_t *_end = (const uint16_t *)cbuf_end; \
+ c = *_p++; \
+ if (is_hi_surrogate(c) && cbuf_type == 2) { \
+ if (_p < _end && is_lo_surrogate(*_p)) { \
+ c = from_surrogate(c, *_p); \
+ } \
+ } \
+ } \
+ } while (0)
+
+#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
+ do { \
+ if (cbuf_type == 0) { \
+ c = cptr[-1]; \
+ } else { \
+ const uint16_t *_p = (const uint16_t *)cptr - 1; \
+ const uint16_t *_start = (const uint16_t *)cbuf_start; \
+ c = *_p; \
+ if (is_lo_surrogate(c) && cbuf_type == 2) { \
+ if (_p > _start && is_hi_surrogate(_p[-1])) { \
+ c = from_surrogate(*--_p, c); \
+ } \
+ } \
+ } \
+ } while (0)
+
+#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
+ do { \
+ if (cbuf_type == 0) { \
+ cptr--; \
+ c = cptr[0]; \
+ } else { \
+ const uint16_t *_p = (const uint16_t *)cptr - 1; \
+ const uint16_t *_start = (const uint16_t *)cbuf_start; \
+ c = *_p; \
+ if (is_lo_surrogate(c) && cbuf_type == 2) { \
+ if (_p > _start && is_hi_surrogate(_p[-1])) { \
+ c = from_surrogate(*--_p, c); \
+ } \
+ } \
+ cptr = (const void *)_p; \
+ } \
+ } while (0)
+
+#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \
+ do { \
+ if (cbuf_type == 0) { \
+ cptr--; \
+ } else { \
+ const uint16_t *_p = (const uint16_t *)cptr - 1; \
+ const uint16_t *_start = (const uint16_t *)cbuf_start; \
+ if (is_lo_surrogate(*_p) && cbuf_type == 2) { \
+ if (_p > _start && is_hi_surrogate(_p[-1])) { \
+ --_p; \
+ } \
+ } \
+ cptr = (const void *)_p; \
+ } \
+ } while (0)
+
+typedef uintptr_t StackInt;
+
+typedef enum {
+ RE_EXEC_STATE_SPLIT,
+ RE_EXEC_STATE_LOOKAHEAD,
+ RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
+ RE_EXEC_STATE_GREEDY_QUANT,
+} REExecStateEnum;
+
+typedef struct REExecState {
+ REExecStateEnum type : 8;
+ uint8_t stack_len;
+ size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
+ const uint8_t *cptr;
+ const uint8_t *pc;
+ void *buf[0];
+} REExecState;
+
+typedef struct {
+ const uint8_t *cbuf;
+ const uint8_t *cbuf_end;
+ /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
+ int cbuf_type;
+ int capture_count;
+ int stack_size_max;
+ BOOL multi_line;
+ BOOL ignore_case;
+ BOOL is_unicode;
+ void *opaque; /* used for stack overflow check */
+
+ size_t state_size;
+ uint8_t *state_stack;
+ size_t state_stack_size;
+ size_t state_stack_len;
+} REExecContext;
+
+static int push_state(REExecContext *s,
+ uint8_t **capture,
+ StackInt *stack, size_t stack_len,
+ const uint8_t *pc, const uint8_t *cptr,
+ REExecStateEnum type, size_t count)
+{
+ REExecState *rs;
+ uint8_t *new_stack;
+ size_t new_size, i, n;
+ StackInt *stack_buf;
+
+ if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
+ /* reallocate the stack */
+ new_size = s->state_stack_size * 3 / 2;
+ if (new_size < 8)
+ new_size = 8;
+ new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
+ if (!new_stack)
+ return -1;
+ s->state_stack_size = new_size;
+ s->state_stack = new_stack;
+ }
+ rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
+ s->state_stack_len++;
+ rs->type = type;
+ rs->count = count;
+ rs->stack_len = stack_len;
+ rs->cptr = cptr;
+ rs->pc = pc;
+ n = 2 * s->capture_count;
+ for(i = 0; i < n; i++)
+ rs->buf[i] = capture[i];
+ stack_buf = (StackInt *)(rs->buf + n);
+ for(i = 0; i < stack_len; i++)
+ stack_buf[i] = stack[i];
+ return 0;
+}
+
+/* return 1 if match, 0 if not match or -1 if error. */
+static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
+ StackInt *stack, int stack_len,
+ const uint8_t *pc, const uint8_t *cptr,
+ BOOL no_recurse)
+{
+ int opcode, ret;
+ int cbuf_type;
+ uint32_t val, c;
+ const uint8_t *cbuf_end;
+
+ cbuf_type = s->cbuf_type;
+ cbuf_end = s->cbuf_end;
+
+ for(;;) {
+ // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
+ opcode = *pc++;
+ switch(opcode) {
+ case REOP_match:
+ {
+ REExecState *rs;
+ if (no_recurse)
+ return (intptr_t)cptr;
+ ret = 1;
+ goto recurse;
+ no_match:
+ if (no_recurse)
+ return 0;
+ ret = 0;
+ recurse:
+ for(;;) {
+ if (s->state_stack_len == 0)
+ return ret;
+ rs = (REExecState *)(s->state_stack +
+ (s->state_stack_len - 1) * s->state_size);
+ if (rs->type == RE_EXEC_STATE_SPLIT) {
+ if (!ret) {
+ pop_state:
+ memcpy(capture, rs->buf,
+ sizeof(capture[0]) * 2 * s->capture_count);
+ pop_state1:
+ pc = rs->pc;
+ cptr = rs->cptr;
+ stack_len = rs->stack_len;
+ memcpy(stack, rs->buf + 2 * s->capture_count,
+ stack_len * sizeof(stack[0]));
+ s->state_stack_len--;
+ break;
+ }
+ } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
+ if (!ret) {
+ uint32_t char_count, i;
+ memcpy(capture, rs->buf,
+ sizeof(capture[0]) * 2 * s->capture_count);
+ stack_len = rs->stack_len;
+ memcpy(stack, rs->buf + 2 * s->capture_count,
+ stack_len * sizeof(stack[0]));
+ pc = rs->pc;
+ cptr = rs->cptr;
+ /* go backward */
+ char_count = get_u32(pc + 12);
+ for(i = 0; i < char_count; i++) {
+ PREV_CHAR(cptr, s->cbuf, cbuf_type);
+ }
+ pc = (pc + 16) + (int)get_u32(pc);
+ rs->cptr = cptr;
+ rs->count--;
+ if (rs->count == 0) {
+ s->state_stack_len--;
+ }
+ break;
+ }
+ } else {
+ ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
+ (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
+ if (ret) {
+ /* keep the capture in case of positive lookahead */
+ if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
+ goto pop_state1;
+ else
+ goto pop_state;
+ }
+ }
+ s->state_stack_len--;
+ }
+ }
+ break;
+ case REOP_char32:
+ val = get_u32(pc);
+ pc += 4;
+ goto test_char;
+ case REOP_char:
+ val = get_u16(pc);
+ pc += 2;
+ test_char:
+ if (cptr >= cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (s->ignore_case) {
+ c = lre_canonicalize(c, s->is_unicode);
+ }
+ if (val != c)
+ goto no_match;
+ break;
+ case REOP_split_goto_first:
+ case REOP_split_next_first:
+ {
+ const uint8_t *pc1;
+
+ val = get_u32(pc);
+ pc += 4;
+ if (opcode == REOP_split_next_first) {
+ pc1 = pc + (int)val;
+ } else {
+ pc1 = pc;
+ pc = pc + (int)val;
+ }
+ ret = push_state(s, capture, stack, stack_len,
+ pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
+ if (ret < 0)
+ return -1;
+ break;
+ }
+ case REOP_lookahead:
+ case REOP_negative_lookahead:
+ val = get_u32(pc);
+ pc += 4;
+ ret = push_state(s, capture, stack, stack_len,
+ pc + (int)val, cptr,
+ RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
+ 0);
+ if (ret < 0)
+ return -1;
+ break;
+
+ case REOP_goto:
+ val = get_u32(pc);
+ pc += 4 + (int)val;
+ break;
+ case REOP_line_start:
+ if (cptr == s->cbuf)
+ break;
+ if (!s->multi_line)
+ goto no_match;
+ PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
+ if (!is_line_terminator(c))
+ goto no_match;
+ break;
+ case REOP_line_end:
+ if (cptr == cbuf_end)
+ break;
+ if (!s->multi_line)
+ goto no_match;
+ PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (!is_line_terminator(c))
+ goto no_match;
+ break;
+ case REOP_dot:
+ if (cptr == cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (is_line_terminator(c))
+ goto no_match;
+ break;
+ case REOP_any:
+ if (cptr == cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ break;
+ case REOP_save_start:
+ case REOP_save_end:
+ val = *pc++;
+ assert(val < s->capture_count);
+ capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
+ break;
+ case REOP_save_reset:
+ {
+ uint32_t val2;
+ val = pc[0];
+ val2 = pc[1];
+ pc += 2;
+ assert(val2 < s->capture_count);
+ while (val <= val2) {
+ capture[2 * val] = NULL;
+ capture[2 * val + 1] = NULL;
+ val++;
+ }
+ }
+ break;
+ case REOP_push_i32:
+ val = get_u32(pc);
+ pc += 4;
+ stack[stack_len++] = val;
+ break;
+ case REOP_drop:
+ stack_len--;
+ break;
+ case REOP_loop:
+ val = get_u32(pc);
+ pc += 4;
+ if (--stack[stack_len - 1] != 0) {
+ pc += (int)val;
+ }
+ break;
+ case REOP_push_char_pos:
+ stack[stack_len++] = (uintptr_t)cptr;
+ break;
+ case REOP_check_advance:
+ if (stack[--stack_len] == (uintptr_t)cptr)
+ goto no_match;
+ break;
+ case REOP_word_boundary:
+ case REOP_not_word_boundary:
+ {
+ BOOL v1, v2;
+ /* char before */
+ if (cptr == s->cbuf) {
+ v1 = FALSE;
+ } else {
+ PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
+ v1 = is_word_char(c);
+ }
+ /* current char */
+ if (cptr >= cbuf_end) {
+ v2 = FALSE;
+ } else {
+ PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
+ v2 = is_word_char(c);
+ }
+ if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
+ goto no_match;
+ }
+ break;
+ case REOP_back_reference:
+ case REOP_backward_back_reference:
+ {
+ const uint8_t *cptr1, *cptr1_end, *cptr1_start;
+ uint32_t c1, c2;
+
+ val = *pc++;
+ if (val >= s->capture_count)
+ goto no_match;
+ cptr1_start = capture[2 * val];
+ cptr1_end = capture[2 * val + 1];
+ if (!cptr1_start || !cptr1_end)
+ break;
+ if (opcode == REOP_back_reference) {
+ cptr1 = cptr1_start;
+ while (cptr1 < cptr1_end) {
+ if (cptr >= cbuf_end)
+ goto no_match;
+ GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
+ GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
+ if (s->ignore_case) {
+ c1 = lre_canonicalize(c1, s->is_unicode);
+ c2 = lre_canonicalize(c2, s->is_unicode);
+ }
+ if (c1 != c2)
+ goto no_match;
+ }
+ } else {
+ cptr1 = cptr1_end;
+ while (cptr1 > cptr1_start) {
+ if (cptr == s->cbuf)
+ goto no_match;
+ GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
+ GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
+ if (s->ignore_case) {
+ c1 = lre_canonicalize(c1, s->is_unicode);
+ c2 = lre_canonicalize(c2, s->is_unicode);
+ }
+ if (c1 != c2)
+ goto no_match;
+ }
+ }
+ }
+ break;
+ case REOP_range:
+ {
+ int n;
+ uint32_t low, high, idx_min, idx_max, idx;
+
+ n = get_u16(pc); /* n must be >= 1 */
+ pc += 2;
+ if (cptr >= cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (s->ignore_case) {
+ c = lre_canonicalize(c, s->is_unicode);
+ }
+ idx_min = 0;
+ low = get_u16(pc + 0 * 4);
+ if (c < low)
+ goto no_match;
+ idx_max = n - 1;
+ high = get_u16(pc + idx_max * 4 + 2);
+ /* 0xffff in for last value means +infinity */
+ if (unlikely(c >= 0xffff) && high == 0xffff)
+ goto range_match;
+ if (c > high)
+ goto no_match;
+ while (idx_min <= idx_max) {
+ idx = (idx_min + idx_max) / 2;
+ low = get_u16(pc + idx * 4);
+ high = get_u16(pc + idx * 4 + 2);
+ if (c < low)
+ idx_max = idx - 1;
+ else if (c > high)
+ idx_min = idx + 1;
+ else
+ goto range_match;
+ }
+ goto no_match;
+ range_match:
+ pc += 4 * n;
+ }
+ break;
+ case REOP_range32:
+ {
+ int n;
+ uint32_t low, high, idx_min, idx_max, idx;
+
+ n = get_u16(pc); /* n must be >= 1 */
+ pc += 2;
+ if (cptr >= cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (s->ignore_case) {
+ c = lre_canonicalize(c, s->is_unicode);
+ }
+ idx_min = 0;
+ low = get_u32(pc + 0 * 8);
+ if (c < low)
+ goto no_match;
+ idx_max = n - 1;
+ high = get_u32(pc + idx_max * 8 + 4);
+ if (c > high)
+ goto no_match;
+ while (idx_min <= idx_max) {
+ idx = (idx_min + idx_max) / 2;
+ low = get_u32(pc + idx * 8);
+ high = get_u32(pc + idx * 8 + 4);
+ if (c < low)
+ idx_max = idx - 1;
+ else if (c > high)
+ idx_min = idx + 1;
+ else
+ goto range32_match;
+ }
+ goto no_match;
+ range32_match:
+ pc += 8 * n;
+ }
+ break;
+ case REOP_prev:
+ /* go to the previous char */
+ if (cptr == s->cbuf)
+ goto no_match;
+ PREV_CHAR(cptr, s->cbuf, cbuf_type);
+ break;
+ case REOP_simple_greedy_quant:
+ {
+ uint32_t next_pos, quant_min, quant_max;
+ size_t q;
+ intptr_t res;
+ const uint8_t *pc1;
+
+ next_pos = get_u32(pc);
+ quant_min = get_u32(pc + 4);
+ quant_max = get_u32(pc + 8);
+ pc += 16;
+ pc1 = pc;
+ pc += (int)next_pos;
+
+ q = 0;
+ for(;;) {
+ res = lre_exec_backtrack(s, capture, stack, stack_len,
+ pc1, cptr, TRUE);
+ if (res == -1)
+ return res;
+ if (!res)
+ break;
+ cptr = (uint8_t *)res;
+ q++;
+ if (q >= quant_max && quant_max != INT32_MAX)
+ break;
+ }
+ if (q < quant_min)
+ goto no_match;
+ if (q > quant_min) {
+ /* will examine all matches down to quant_min */
+ ret = push_state(s, capture, stack, stack_len,
+ pc1 - 16, cptr,
+ RE_EXEC_STATE_GREEDY_QUANT,
+ q - quant_min);
+ if (ret < 0)
+ return -1;
+ }
+ }
+ break;
+ default:
+ abort();
+ }
+ }
+}
+
+/* Return 1 if match, 0 if not match or -1 if error. cindex is the
+ starting position of the match and must be such as 0 <= cindex <=
+ clen. */
+int lre_exec(uint8_t **capture,
+ const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
+ int cbuf_type, void *opaque)
+{
+ REExecContext s_s, *s = &s_s;
+ int re_flags, i, alloca_size, ret;
+ StackInt *stack_buf;
+
+ re_flags = lre_get_flags(bc_buf);
+ s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
+ s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
+ s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
+ s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
+ s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
+ s->cbuf = cbuf;
+ s->cbuf_end = cbuf + (clen << cbuf_type);
+ s->cbuf_type = cbuf_type;
+ if (s->cbuf_type == 1 && s->is_unicode)
+ s->cbuf_type = 2;
+ s->opaque = opaque;
+
+ s->state_size = sizeof(REExecState) +
+ s->capture_count * sizeof(capture[0]) * 2 +
+ s->stack_size_max * sizeof(stack_buf[0]);
+ s->state_stack = NULL;
+ s->state_stack_len = 0;
+ s->state_stack_size = 0;
+
+ for(i = 0; i < s->capture_count * 2; i++)
+ capture[i] = NULL;
+ alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
+ stack_buf = alloca(alloca_size);
+ ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
+ cbuf + (cindex << cbuf_type), FALSE);
+ lre_realloc(s->opaque, s->state_stack, 0);
+ return ret;
+}
+
+int lre_get_capture_count(const uint8_t *bc_buf)
+{
+ return bc_buf[RE_HEADER_CAPTURE_COUNT];
+}
+
+int lre_get_flags(const uint8_t *bc_buf)
+{
+ return bc_buf[RE_HEADER_FLAGS];
+}
+
+/* Return NULL if no group names. Otherwise, return a pointer to
+ 'capture_count - 1' zero terminated UTF-8 strings. */
+const char *lre_get_groupnames(const uint8_t *bc_buf)
+{
+ uint32_t re_bytecode_len;
+ if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
+ return NULL;
+ re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
+ return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
+}
+
+#ifdef TEST
+
+BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
+{
+ return FALSE;
+}
+
+void *lre_realloc(void *opaque, void *ptr, size_t size)
+{
+ return realloc(ptr, size);
+}
+
+int main(int argc, char **argv)
+{
+ int len, flags, ret, i;
+ uint8_t *bc;
+ char error_msg[64];
+ uint8_t *capture[CAPTURE_COUNT_MAX * 2];
+ const char *input;
+ int input_len, capture_count;
+
+ if (argc < 4) {
+ printf("usage: %s regexp flags input\n", argv[0]);
+ return 1;
+ }
+ flags = atoi(argv[2]);
+ bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
+ strlen(argv[1]), flags, NULL);
+ if (!bc) {
+ fprintf(stderr, "error: %s\n", error_msg);
+ exit(1);
+ }
+
+ input = argv[3];
+ input_len = strlen(input);
+
+ ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
+ printf("ret=%d\n", ret);
+ if (ret == 1) {
+ capture_count = lre_get_capture_count(bc);
+ for(i = 0; i < 2 * capture_count; i++) {
+ uint8_t *ptr;
+ ptr = capture[i];
+ printf("%d: ", i);
+ if (!ptr)
+ printf("<nil>");
+ else
+ printf("%u", (int)(ptr - (uint8_t *)input));
+ printf("\n");
+ }
+ }
+ return 0;
+}
+#endif