summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_study.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_study.c')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_study.c192
1 files changed, 125 insertions, 67 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_study.c b/src/3rdparty/pcre2/src/pcre2_study.c
index db08266745..b92686759d 100644
--- a/src/3rdparty/pcre2/src/pcre2_study.c
+++ b/src/3rdparty/pcre2/src/pcre2_study.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016 University of Cambridge
+ New API code Copyright (c) 2016-2017 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -46,9 +46,11 @@ collecting data (e.g. minimum matching length). */
#include "config.h"
#endif
-
#include "pcre2_internal.h"
+/* The maximum remembered capturing brackets minimum. */
+
+#define MAX_CACHE_BACKREF 128
/* Set a bit in the starting code unit bit map. */
@@ -71,6 +73,12 @@ length is 16-bits long (on the grounds that anything longer than that is
pathological), so we give up when we reach that amount. This also means that
integer overflow for really crazy patterns cannot happen.
+Backreference minimum lengths are cached to speed up multiple references. This
+function is called only when the highest back reference in the pattern is less
+than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
+caching vector. The zeroth element contains the number of the highest set
+value.
+
Arguments:
re compiled pattern block
code pointer to start of group (the bracket)
@@ -78,6 +86,7 @@ Arguments:
utf UTF flag
recurses chain of recurse_check to catch mutual recursion
countptr pointer to call count (to catch over complexity)
+ backref_cache vector for caching back references.
Returns: the minimum length
-1 \C in UTF-8 mode
@@ -90,7 +99,8 @@ Returns: the minimum length
static int
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
- PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
+ PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
+ int *backref_cache)
{
int length = -1;
int prev_cap_recno = -1;
@@ -101,8 +111,8 @@ uint32_t once_fudge = 0;
BOOL had_recurse = FALSE;
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
recurse_check this_recurse;
-register int branchlength = 0;
-register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
+int branchlength = 0;
+PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
/* If this is a "could be empty" group, its minimum length is 0. */
@@ -124,7 +134,7 @@ for (;;)
{
int d, min, recno;
PCRE2_UCHAR *cs, *ce;
- register PCRE2_UCHAR op = *cc;
+ PCRE2_UCHAR op = *cc;
if (branchlength >= UINT16_MAX) return UINT16_MAX;
@@ -146,12 +156,12 @@ for (;;)
}
goto PROCESS_NON_CAPTURE;
- /* There's a special case of OP_ONCE, when it is wrapped round an
+ case OP_BRA:
+ /* There's a special case of OP_BRA, when it is wrapped round a repeated
OP_RECURSE. We'd like to process the latter at this level so that
remembering the value works for repeated cases. So we do nothing, but
set a fudge value to skip over the OP_KET after the recurse. */
- case OP_ONCE:
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
{
once_fudge = 1 + LINK_SIZE;
@@ -160,13 +170,13 @@ for (;;)
}
/* Fall through */
- case OP_ONCE_NC:
- case OP_BRA:
+ case OP_ONCE:
case OP_SBRA:
case OP_BRAPOS:
case OP_SBRAPOS:
PROCESS_NON_CAPTURE:
- d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+ d = find_minlength(re, cc, startcode, utf, recurses, countptr,
+ backref_cache);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -182,11 +192,12 @@ for (;;)
case OP_SCBRA:
case OP_CBRAPOS:
case OP_SCBRAPOS:
- recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
- if (recno != prev_cap_recno)
+ recno = (int)GET2(cc, 1+LINK_SIZE);
+ if (dupcapused || recno != prev_cap_recno)
{
prev_cap_recno = recno;
- prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+ prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
+ backref_cache);
if (prev_cap_d < 0) return prev_cap_d;
}
branchlength += prev_cap_d;
@@ -456,38 +467,52 @@ for (;;)
d = INT_MAX;
- /* Scan all groups with the same name */
+ /* Scan all groups with the same name; find the shortest. */
while (count-- > 0)
{
- ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
- if (cs == NULL) return -2;
- do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- {
- d = 0;
- had_recurse = TRUE;
- break;
- }
+ int dd, i;
+ recno = GET2(slot, 0);
+
+ if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
+ dd = backref_cache[recno];
else
{
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
+ if (cs == NULL) return -2;
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce) /* Simple recursion */
{
- d = 0;
+ dd = 0;
had_recurse = TRUE;
- break;
}
else
{
- int dd;
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
- if (dd < d) d = dd;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev)
+ if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ dd = 0;
+ had_recurse = TRUE;
+ }
+ else
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ dd = find_minlength(re, cs, startcode, utf, &this_recurse,
+ countptr, backref_cache);
+ if (dd < 0) return dd;
+ }
}
+
+ backref_cache[recno] = dd;
+ for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
+ backref_cache[0] = recno;
}
+
+ if (dd < d) d = dd;
+ if (d <= 0) break; /* No point looking at any more */
slot += re->name_entry_size;
}
}
@@ -501,34 +526,48 @@ for (;;)
case OP_REF:
case OP_REFI:
if (dupcapused) return -1;
- if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
+ recno = GET2(cc, 1);
+ if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
+ d = backref_cache[recno];
+ else
{
- ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
- if (cs == NULL) return -2;
- do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- {
- d = 0;
- had_recurse = TRUE;
- }
- else
+ int i;
+ if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
+ if (cs == NULL) return -2;
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ d = 0;
+ had_recurse = TRUE;
+ }
+ else
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
+ backref_cache);
+ if (d < 0) return d;
+ }
}
}
+ else d = 0;
+
+ backref_cache[recno] = d;
+ for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
+ backref_cache[0] = recno;
}
- else d = 0;
+
cc += 1 + IMM2_SIZE;
/* Handle repeated back references */
@@ -601,7 +640,7 @@ for (;;)
this_recurse.prev = recurses;
this_recurse.group = cs;
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
- countptr);
+ countptr, backref_cache);
if (prev_recurse_d < 0) return prev_recurse_d;
prev_recurse_recno = recno;
branchlength += prev_recurse_d;
@@ -747,6 +786,7 @@ if (utf)
if (caseless)
{
+#ifdef SUPPORT_UNICODE
if (utf)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
@@ -759,10 +799,12 @@ if (caseless)
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
+ else
+#endif /* SUPPORT_UNICODE */
/* Not UTF */
- else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
+ if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
}
return p;
@@ -792,7 +834,7 @@ Returns: nothing
static void
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
{
-register uint32_t c;
+uint32_t c;
for (c = 0; c < table_limit; c++)
re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@@ -833,7 +875,7 @@ Returns: nothing
static void
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
{
-register uint32_t c;
+uint32_t c;
for (c = 0; c < table_limit; c++)
re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@@ -873,7 +915,7 @@ Returns: SSB_FAIL => Failed to find any starting code units
static int
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
{
-register uint32_t c;
+uint32_t c;
int yield = SSB_DONE;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@@ -911,7 +953,6 @@ do
case OP_ALLANY:
case OP_ANY:
case OP_ANYBYTE:
- case OP_CIRC:
case OP_CIRCM:
case OP_CLOSE:
case OP_COMMIT:
@@ -979,6 +1020,13 @@ do
case OP_THEN_ARG:
return SSB_FAIL;
+ /* OP_CIRC happens only at the start of an anchored branch (multiline ^
+ uses OP_CIRCM). Skip over it. */
+
+ case OP_CIRC:
+ tcode += PRIV(OP_lengths)[OP_CIRC];
+ break;
+
/* A "real" property test implies no starting bits, but the fake property
PT_CLIST identifies a list of characters. These lists are short, as they
are used for characters with more than one "other case", so there is no
@@ -1025,7 +1073,6 @@ do
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_ONCE:
- case OP_ONCE_NC:
case OP_ASSERT:
rc = set_start_bits(re, tcode, utf);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
@@ -1407,6 +1454,10 @@ do
classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
#endif
+ /* It seems that the fall through comment must be outside the #ifdef if
+ it is to avoid the gcc compiler warning. */
+
+ /* Fall through */
/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
@@ -1534,24 +1585,31 @@ BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_entry_size * re->name_count;
-/* For an anchored pattern, or an unanchored pattern that has a first code
-unit, or a multiline pattern that matches only at "line start", there is no
-point in seeking a list of starting code units. */
+/* For a pattern that has a first code unit, or a multiline pattern that
+matches only at "line start", there is no point in seeking a list of starting
+code units. */
-if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
- (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
+if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int rc = set_start_bits(re, code, utf);
if (rc == SSB_UNKNOWN) return 1;
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
}
-/* Find the minimum length of subject string. If it can match an empty string,
-the minimum length is already known. */
+/* Find the minimum length of subject string. If the pattern can match an empty
+string, the minimum length is already known. If there are more back references
+than the size of the vector we are going to cache them in, do nothing. A
+pattern that complicated will probably take a long time to analyze and may in
+any case turn out to be too complicated. Note that back reference minima are
+held as 16-bit numbers. */
-if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
+if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
+ re->top_backref <= MAX_CACHE_BACKREF)
{
- switch(min = find_minlength(re, code, code, utf, NULL, &count))
+ int backref_cache[MAX_CACHE_BACKREF+1];
+ backref_cache[0] = 0; /* Highest one that is set */
+ min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
+ switch(min)
{
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
break; /* Leave minlength unchanged (will be zero) */