summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_auto_possess.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_auto_possess.c')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_auto_possess.c98
1 files changed, 66 insertions, 32 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_auto_possess.c b/src/3rdparty/pcre2/src/pcre2_auto_possess.c
index 5b95b9b8a8..210d13d37a 100644
--- a/src/3rdparty/pcre2/src/pcre2_auto_possess.c
+++ b/src/3rdparty/pcre2/src/pcre2_auto_possess.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2019 University of Cambridge
+ New API code Copyright (c) 2016-2022 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -123,18 +123,21 @@ opcode is used to select the column. The values are as follows:
*/
static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
-/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
- { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
- { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
- { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
- { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
- { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
- { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
- { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
- { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
+/* ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
+ { 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */
+ { 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */
+ { 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */
+ { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
+ { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */
+ { 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */
+ { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */
+ { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */
+ { 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */
};
/* This table is used to check whether auto-possessification is possible
@@ -196,6 +199,7 @@ static BOOL
check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
BOOL negated)
{
+BOOL ok;
const uint32_t *p;
const ucd_record *prop = GET_UCD(c);
@@ -215,6 +219,11 @@ switch(ptype)
case PT_SC:
return (pdata == prop->script) == negated;
+ case PT_SCX:
+ ok = (pdata == prop->script
+ || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
+ return ok == negated;
+
/* These are specials */
case PT_ALNUM:
@@ -251,6 +260,14 @@ switch(ptype)
if (c == *p++) return negated;
}
break; /* Control never reaches here */
+
+ /* Haven't yet thought these through. */
+
+ case PT_BIDICL:
+ return FALSE;
+
+ case PT_BOOL:
+ return FALSE;
}
return FALSE;
@@ -292,6 +309,7 @@ possessification, and if so, fills a list with its properties.
Arguments:
code points to start of expression
utf TRUE if in UTF mode
+ ucp TRUE if in UCP mode
fcc points to the case-flipping table
list points to output list
list[0] will be filled with the opcode
@@ -304,7 +322,7 @@ Returns: points to the start of the next opcode if *code is accepted
*/
static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list)
{
PCRE2_UCHAR c = *code;
@@ -316,7 +334,8 @@ uint32_t chr;
uint32_t *clist_dest;
const uint32_t *clist_src;
#else
-(void)utf; /* Suppress "unused parameter" compiler warning */
+(void)utf; /* Suppress "unused parameter" compiler warnings */
+(void)ucp;
#endif
list[0] = c;
@@ -396,7 +415,7 @@ switch(c)
list[2] = chr;
#ifdef SUPPORT_UNICODE
- if (chr < 128 || (chr < 256 && !utf))
+ if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr];
else
list[3] = UCD_OTHERCASE(chr);
@@ -488,6 +507,7 @@ switch(c)
list[2] = (uint32_t)(end - code);
return end;
}
+
return NULL; /* Opcode not accepted */
}
@@ -503,6 +523,7 @@ which case the base cannot be possessified.
Arguments:
code points to the byte code
utf TRUE in UTF mode
+ ucp TRUE in UCP mode
cb compile data block
base_list the data list of the base opcode
base_end the end of the base opcode
@@ -512,7 +533,7 @@ Returns: TRUE if the auto-possessification is possible
*/
static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{
PCRE2_UCHAR c;
@@ -539,6 +560,8 @@ matches to an empty string (also represented by a non-zero value). */
for(;;)
{
+ PCRE2_SPTR bracode;
+
/* All operations move the code pointer forward.
Therefore infinite recursions are not possible. */
@@ -596,7 +619,8 @@ for(;;)
recursions. (This could be improved by keeping a list of group numbers that
are called by recursion.) */
- switch(*(code - GET(code, 1)))
+ bracode = code - GET(code, 1);
+ switch(*bracode)
{
case OP_CBRA:
case OP_SCBRA:
@@ -615,16 +639,19 @@ for(;;)
break;
/* Atomic sub-patterns and assertions can always auto-possessify their
- last iterator. However, if the group was entered as a result of checking
- a previous iterator, this is not possible. */
+ last iterator except for variable length lookbehinds. However, if the
+ group was entered as a result of checking a previous iterator, this is
+ not possible. */
case OP_ASSERT:
case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
case OP_ONCE:
return !entered_a_group;
+ case OP_ASSERTBACK:
+ case OP_ASSERTBACK_NOT:
+ return (bracode[1+LINK_SIZE] == OP_VREVERSE)? FALSE : !entered_a_group;
+
/* Non-atomic assertions - don't possessify last iterator. This needs
more thought. */
@@ -651,7 +678,7 @@ for(;;)
while (*next_code == OP_ALT)
{
- if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+ if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE;
code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1);
@@ -672,7 +699,8 @@ for(;;)
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE;
- if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+ if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
+ rec_limit))
return FALSE;
code += PRIV(OP_lengths)[c];
@@ -688,7 +716,7 @@ for(;;)
/* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */
- code = get_chr_property_list(code, utf, cb->fcc, list);
+ code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing
@@ -1100,7 +1128,6 @@ leaving the remainder of the pattern unpossessified.
Arguments:
code points to start of the byte code
- utf TRUE in UTF mode
cb compile data block
Returns: 0 for success
@@ -1108,13 +1135,15 @@ Returns: 0 for success
*/
int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SPTR end;
PCRE2_UCHAR *repeat_opcode;
uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
+BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
+BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;)
{
@@ -1126,10 +1155,11 @@ for (;;)
{
c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ?
- get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+ get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
- if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+ if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
+ &rec_limit))
{
switch(c)
{
@@ -1180,12 +1210,16 @@ for (;;)
c = *repeat_opcode;
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{
- /* end must not be NULL. */
- end = get_chr_property_list(code, utf, cb->fcc, list);
+ /* The return from get_chr_property_list() will never be NULL when
+ *code (aka c) is one of the three class opcodes. However, gcc with
+ -fanalyzer notes that a NULL return is possible, and grumbles. Hence we
+ put in a check. */
+ end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0;
- if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+ if (end != NULL &&
+ compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{
switch (c)
{