summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_dfa_match.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_dfa_match.c')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_dfa_match.c252
1 files changed, 185 insertions, 67 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_dfa_match.c b/src/3rdparty/pcre2/src/pcre2_dfa_match.c
index 060dc7669a..caae65248f 100644
--- a/src/3rdparty/pcre2/src/pcre2_dfa_match.c
+++ b/src/3rdparty/pcre2/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2021 University of Cambridge
+ New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -168,7 +168,7 @@ static const uint8_t coptable[] = {
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
- 0, /* Reverse */
+ 0, 0, /* Reverse, Vreverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
@@ -187,7 +187,8 @@ static const uint8_t coptable[] = {
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, /* COMMIT, COMMIT_ARG */
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
- 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
+ 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
+ 0, 0 /* \B and \b in UCP mode */
};
/* This table identifies those opcodes that inspect a character. It is used to
@@ -245,7 +246,7 @@ static const uint8_t poptable[] = {
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
- 0, /* Reverse */
+ 0, 0, /* Reverse, Vreverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
@@ -264,7 +265,8 @@ static const uint8_t poptable[] = {
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, /* COMMIT, COMMIT_ARG */
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
- 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
+ 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
+ 1, 1 /* \B and \b in UCP mode */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -350,7 +352,7 @@ Returns: the return from the callout
*/
static int
-do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
+do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
PCRE2_SIZE *lengthptr)
{
@@ -426,7 +428,7 @@ overflow. */
else
{
- uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
+ uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
uint32_t newsizeK = newsize/(1024/sizeof(int));
if (newsizeK + mb->heap_used > mb->heap_limit)
@@ -589,7 +591,7 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
end_code = this_start_code;
do
{
- size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
+ size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
if (back > max_back) max_back = back;
end_code += GET(end_code, 1);
}
@@ -633,8 +635,8 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
end_code = this_start_code;
do
{
- uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
- size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
+ uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
+ size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
@@ -1100,6 +1102,8 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY:
+ case OP_NOT_UCP_WORD_BOUNDARY:
+ case OP_UCP_WORD_BOUNDARY:
{
int left_word, right_word;
@@ -1112,13 +1116,13 @@ for (;;)
#endif
GETCHARTEST(d, temp);
#ifdef SUPPORT_UNICODE
- if ((mb->poptions & PCRE2_UCP) != 0)
+ if (codevalue == OP_UCP_WORD_BOUNDARY ||
+ codevalue == OP_NOT_UCP_WORD_BOUNDARY)
{
- if (d == '_') left_word = TRUE; else
- {
- uint32_t cat = UCD_CATEGORY(d);
- left_word = (cat == ucp_L || cat == ucp_N);
- }
+ int chartype = UCD_CHARTYPE(d);
+ int category = PRIV(ucp_gentype)[chartype];
+ left_word = (category == ucp_L || category == ucp_N ||
+ chartype == ucp_Mn || chartype == ucp_Pc);
}
else
#endif
@@ -1137,13 +1141,13 @@ for (;;)
mb->last_used_ptr = temp;
}
#ifdef SUPPORT_UNICODE
- if ((mb->poptions & PCRE2_UCP) != 0)
+ if (codevalue == OP_UCP_WORD_BOUNDARY ||
+ codevalue == OP_NOT_UCP_WORD_BOUNDARY)
{
- if (c == '_') right_word = TRUE; else
- {
- uint32_t cat = UCD_CATEGORY(c);
- right_word = (cat == ucp_L || cat == ucp_N);
- }
+ int chartype = UCD_CHARTYPE(c);
+ int category = PRIV(ucp_gentype)[chartype];
+ right_word = (category == ucp_L || category == ucp_N ||
+ chartype == ucp_Mn || chartype == ucp_Pc);
}
else
#endif
@@ -1151,7 +1155,9 @@ for (;;)
}
else right_word = FALSE;
- if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
+ if ((left_word == right_word) ==
+ (codevalue == OP_NOT_WORD_BOUNDARY ||
+ codevalue == OP_NOT_UCP_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
}
break;
@@ -1168,6 +1174,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
+ int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[1])
@@ -1177,8 +1184,9 @@ for (;;)
break;
case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt;
+ chartype = prop->chartype;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+ chartype == ucp_Lt;
break;
case PT_GC:
@@ -1193,11 +1201,17 @@ for (;;)
OK = prop->script == code[2];
break;
+ case PT_SCX:
+ OK = (prop->script == code[2] ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
+ break;
+
/* These are specials for combination cases. */
case PT_ALNUM:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@@ -1220,12 +1234,20 @@ for (;;)
break;
case PT_WORD:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
- c == CHAR_UNDERSCORE;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ if (c > MAX_UTF_CODE_POINT)
+ {
+ OK = FALSE;
+ break;
+ }
+#endif
cp = PRIV(ucd_caseless_sets) + code[2];
for (;;)
{
@@ -1240,6 +1262,15 @@ for (;;)
c >= 0xe000;
break;
+ case PT_BIDICL:
+ OK = UCD_BIDICLASS(c) == code[2];
+ break;
+
+ case PT_BOOL:
+ OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), code[2]) != 0;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1426,6 +1457,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
+ int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[2])
@@ -1435,8 +1467,8 @@ for (;;)
break;
case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt;
+ chartype = prop->chartype;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
@@ -1451,11 +1483,17 @@ for (;;)
OK = prop->script == code[3];
break;
+ case PT_SCX:
+ OK = (prop->script == code[3] ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
+ break;
+
/* These are specials for combination cases. */
case PT_ALNUM:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@@ -1478,12 +1516,20 @@ for (;;)
break;
case PT_WORD:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
- c == CHAR_UNDERSCORE;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ if (c > MAX_UTF_CODE_POINT)
+ {
+ OK = FALSE;
+ break;
+ }
+#endif
cp = PRIV(ucd_caseless_sets) + code[3];
for (;;)
{
@@ -1498,6 +1544,15 @@ for (;;)
c >= 0xe000;
break;
+ case PT_BIDICL:
+ OK = UCD_BIDICLASS(c) == code[3];
+ break;
+
+ case PT_BOOL:
+ OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), code[3]) != 0;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1667,6 +1722,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
+ int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[2])
@@ -1676,8 +1732,8 @@ for (;;)
break;
case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt;
+ chartype = prop->chartype;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
@@ -1692,11 +1748,17 @@ for (;;)
OK = prop->script == code[3];
break;
+ case PT_SCX:
+ OK = (prop->script == code[3] ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
+ break;
+
/* These are specials for combination cases. */
case PT_ALNUM:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@@ -1719,12 +1781,20 @@ for (;;)
break;
case PT_WORD:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
- c == CHAR_UNDERSCORE;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ if (c > MAX_UTF_CODE_POINT)
+ {
+ OK = FALSE;
+ break;
+ }
+#endif
cp = PRIV(ucd_caseless_sets) + code[3];
for (;;)
{
@@ -1739,6 +1809,15 @@ for (;;)
c >= 0xe000;
break;
+ case PT_BIDICL:
+ OK = UCD_BIDICLASS(c) == code[3];
+ break;
+
+ case PT_BOOL:
+ OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), code[3]) != 0;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1933,6 +2012,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
+ int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[1 + IMM2_SIZE + 1])
@@ -1942,8 +2022,8 @@ for (;;)
break;
case PT_LAMP:
- OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt;
+ chartype = prop->chartype;
+ OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
@@ -1958,11 +2038,18 @@ for (;;)
OK = prop->script == code[1 + IMM2_SIZE + 2];
break;
+ case PT_SCX:
+ OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
+ code[1 + IMM2_SIZE + 2]) != 0);
+ break;
+
/* These are specials for combination cases. */
case PT_ALNUM:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@@ -1985,12 +2072,20 @@ for (;;)
break;
case PT_WORD:
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
- c == CHAR_UNDERSCORE;
+ chartype = prop->chartype;
+ OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
+ PRIV(ucp_gentype)[chartype] == ucp_N ||
+ chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ if (c > MAX_UTF_CODE_POINT)
+ {
+ OK = FALSE;
+ break;
+ }
+#endif
cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
for (;;)
{
@@ -2005,6 +2100,15 @@ for (;;)
c >= 0xe000;
break;
+ case PT_BIDICL:
+ OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
+ break;
+
+ case PT_BOOL:
+ OK = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -2742,7 +2846,7 @@ for (;;)
|| code[LINK_SIZE + 1] == OP_CALLOUT_STR)
{
PCRE2_SIZE callout_length;
- rrc = do_callout(code, offsets, current_subject, ptr, mb,
+ rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
1 + LINK_SIZE, &callout_length);
if (rrc < 0) return rrc; /* Abandon */
if (rrc > 0) break; /* Fail this thread */
@@ -2837,7 +2941,6 @@ for (;;)
int *local_workspace;
PCRE2_SIZE *local_offsets;
RWS_anchor *rws = (RWS_anchor *)RWS;
- dfa_recursion_info *ri;
PCRE2_SPTR callpat = start_code + GET(code, 1);
uint32_t recno = (callpat == mb->start_code)? 0 :
GET2(callpat, 1 + LINK_SIZE);
@@ -2854,18 +2957,24 @@ for (;;)
rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
/* Check for repeating a recursion without advancing the subject
- pointer. This should catch convoluted mutual recursions. (Some simple
- cases are caught at compile time.) */
+ pointer or last used character. This should catch convoluted mutual
+ recursions. (Some simple cases are caught at compile time.) */
- for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
- if (recno == ri->group_num && ptr == ri->subject_position)
+ for (dfa_recursion_info *ri = mb->recursive;
+ ri != NULL;
+ ri = ri->prevrec)
+ {
+ if (recno == ri->group_num && ptr == ri->subject_position &&
+ mb->last_used_ptr == ri->last_used_ptr)
return PCRE2_ERROR_RECURSELOOP;
+ }
/* Remember this recursion and where we started it so as to
catch infinite loops. */
new_recursive.group_num = recno;
new_recursive.subject_position = ptr;
+ new_recursive.last_used_ptr = mb->last_used_ptr;
new_recursive.prevrec = mb->recursive;
mb->recursive = &new_recursive;
@@ -3139,7 +3248,7 @@ for (;;)
case OP_CALLOUT_STR:
{
PCRE2_SIZE callout_length;
- rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
+ rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
&callout_length);
if (rrc < 0) return rrc; /* Abandon */
if (rrc == 0)
@@ -3285,20 +3394,22 @@ rws->next = NULL;
rws->size = RWS_BASE_SIZE;
rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
-/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
-subject string. */
+/* Recognize NULL, length 0 as an empty string. */
-if (length == PCRE2_ZERO_TERMINATED)
- {
- length = PRIV(strlen)(subject);
- was_zero_terminated = 1;
- }
+if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
/* Plausibility checks */
if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
return PCRE2_ERROR_NULL;
+
+if (length == PCRE2_ZERO_TERMINATED)
+ {
+ length = PRIV(strlen)(subject);
+ was_zero_terminated = 1;
+ }
+
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
@@ -3365,7 +3476,7 @@ anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
where to start. */
startline = (re->flags & PCRE2_STARTLINE) != 0;
-firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
+firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
bumpalong_limit = end_subject;
/* Initialize and set up the fixed fields in the callout block, with a pointer
@@ -3935,8 +4046,9 @@ for (;;)
match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
}
+ match_data->subject_length = length;
match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
- match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
+ match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
match_data->startchar = (PCRE2_SIZE)(start_match - subject);
match_data->rc = rc;
@@ -3998,4 +4110,10 @@ while (rws->next != NULL)
return rc;
}
+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND /* Field containing processed string end */
+
/* End of pcre2_dfa_match.c */