summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_match.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_match.c')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_match.c577
1 files changed, 405 insertions, 172 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_match.c b/src/3rdparty/pcre2/src/pcre2_match.c
index e3f78c2ca3..168b9fad01 100644
--- a/src/3rdparty/pcre2/src/pcre2_match.c
+++ b/src/3rdparty/pcre2/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2015-2020 University of Cambridge
+ New API code Copyright (c) 2015-2022 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -49,7 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
/* #define DEBUG_SHOW_OPS */
/* #define DEBUG_SHOW_RMATCH */
-#ifdef DEBUG_FRAME_DISPLAY
+#ifdef DEBUG_FRAMES_DISPLAY
#include <stdarg.h>
#endif
@@ -159,7 +159,8 @@ enum { RM100=100, RM101 };
#ifdef SUPPORT_UNICODE
enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,
- RM216, RM217, RM218, RM219, RM220, RM221, RM222 };
+ RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223,
+ RM224, RM225 };
#endif
/* Define short names for general fields in the current backtrack frame, which
@@ -203,6 +204,7 @@ Arguments:
P a previous frame of interest
frame_size the frame size
mb points to the match block
+ match_data points to the match data block
s identification text
Returns: nothing
@@ -210,7 +212,7 @@ Returns: nothing
static void
display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
- match_block *mb, const char *s, ...)
+ match_block *mb, pcre2_match_data *match_data, const char *s, ...)
{
uint32_t i;
heapframe *Q;
@@ -222,10 +224,10 @@ vfprintf(f, s, ap);
va_end(ap);
if (P != NULL) fprintf(f, " P=%lu",
- ((char *)P - (char *)(mb->match_frames))/frame_size);
+ ((char *)P - (char *)(match_data->heapframes))/frame_size);
fprintf(f, "\n");
-for (i = 0, Q = mb->match_frames;
+for (i = 0, Q = match_data->heapframes;
Q <= F;
i++, Q = (heapframe *)((char *)Q + frame_size))
{
@@ -489,10 +491,16 @@ A version did exist that used individual frames on the heap instead of calling
match() recursively, but this ran substantially slower. The current version is
a refactoring that uses a vector of frames to remember backtracking points.
This runs no slower, and possibly even a bit faster than the original recursive
-implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
-50 frames) is allocated on the system stack. If this is not big enough, the
-heap is used for a larger vector.
-
+implementation.
+
+At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50
+frames) was allocated on the system stack. If this was not big enough, the heap
+was used for a larger vector. However, it turns out that there are environments
+where taking as little as 20KiB from the system stack is an embarrassment.
+After another refactoring, the heap is used exclusively, but a pointer the
+frames vector and its size are cached in the match_data block, so that there is
+no new memory allocation if the same match_data block is used for multiple
+matches (unless the frames vector has to be extended).
*******************************************************************************
******************************************************************************/
@@ -565,10 +573,9 @@ made performance worse.
Arguments:
start_eptr starting character in subject
start_ecode starting position in compiled code
- ovector pointer to the final output vector
- oveccount number of pairs in ovector
top_bracket number of capturing parentheses in the pattern
frame_size size of each backtracking frame
+ match_data pointer to the match_data block
mb pointer to "static" variables block
Returns: MATCH_MATCH if matched ) these values are >= 0
@@ -579,17 +586,19 @@ Returns: MATCH_MATCH if matched ) these values are >= 0
*/
static int
-match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
- uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
- match_block *mb)
+match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,
+ PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
{
/* Frame-handling variables */
heapframe *F; /* Current frame pointer */
heapframe *N = NULL; /* Temporary frame pointers */
heapframe *P = NULL;
+
+heapframe *frames_top; /* End of frames vector */
heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
-PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
+PCRE2_SIZE heapframes_size; /* Usable size of frames vector */
+PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
/* Local variables that do not need to be preserved over calls to RRMATCH(). */
@@ -626,10 +635,14 @@ copied when a new frame is created. */
frame_copy_size = frame_size - offsetof(heapframe, eptr);
-/* Set up the first current frame at the start of the vector, and initialize
-fields that are not reset for new frames. */
+/* Set up the first frame and the end of the frames vector. We set the local
+heapframes_size to the usuable amount of the vector, that is, a whole number of
+frames. */
+
+F = match_data->heapframes;
+heapframes_size = (match_data->heapframes_size / frame_size) * frame_size;
+frames_top = (heapframe *)((char *)F + heapframes_size);
-F = mb->match_frames;
Frdepth = 0; /* "Recursion" depth */
Fcapture_last = 0; /* Number of most recent capture */
Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
@@ -645,34 +658,35 @@ backtracking point. */
MATCH_RECURSE:
-/* Set up a new backtracking frame. If the vector is full, get a new one
-on the heap, doubling the size, but constrained by the heap limit. */
+/* Set up a new backtracking frame. If the vector is full, get a new one,
+doubling the size, but constrained by the heap limit (which is in KiB). */
N = (heapframe *)((char *)F + frame_size);
-if (N >= mb->match_frames_top)
+if (N >= frames_top)
{
- PCRE2_SIZE newsize = mb->frame_vector_size * 2;
heapframe *new;
+ PCRE2_SIZE newsize = match_data->heapframes_size * 2;
- if ((newsize / 1024) > mb->heap_limit)
+ if (newsize > mb->heap_limit)
{
- PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
- if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
+ PCRE2_SIZE maxsize = (mb->heap_limit/frame_size) * frame_size;
+ if (match_data->heapframes_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
newsize = maxsize;
}
- new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
+ new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
- memcpy(new, mb->match_frames, mb->frame_vector_size);
+ memcpy(new, match_data->heapframes, heapframes_size);
- F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
+ F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes));
N = (heapframe *)((char *)F + frame_size);
- if (mb->match_frames != mb->stack_frames)
- mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
- mb->match_frames = new;
- mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
- mb->frame_vector_size = newsize;
+ match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
+ match_data->heapframes = new;
+ match_data->heapframes_size = newsize;
+
+ heapframes_size = (newsize / frame_size) * frame_size;
+ frames_top = (heapframe *)((char *)new + heapframes_size);
}
#ifdef DEBUG_SHOW_RMATCH
@@ -730,7 +744,7 @@ recursion value. */
if (group_frame_type != 0)
{
- Flast_group_offset = (char *)F - (char *)mb->match_frames;
+ Flast_group_offset = (char *)F - (char *)match_data->heapframes;
if (GF_IDMASK(group_frame_type) == GF_RECURSE)
Fcurrent_recurse = GF_DATAMASK(group_frame_type);
group_frame_type = 0;
@@ -772,7 +786,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
for(;;)
{
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
- N = (heapframe *)((char *)mb->match_frames + offset);
+ N = (heapframe *)((char *)match_data->heapframes + offset);
P = (heapframe *)((char *)N - frame_size);
if (N->group_frame_type == (GF_CAPTURE | number)) break;
offset = P->last_group_offset;
@@ -810,7 +824,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
for(;;)
{
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
- N = (heapframe *)((char *)mb->match_frames + offset);
+ N = (heapframe *)((char *)match_data->heapframes + offset);
P = (heapframe *)((char *)N - frame_size);
if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
offset = P->last_group_offset;
@@ -818,10 +832,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* N is now the frame of the recursion; the previous frame is at the
OP_RECURSE position. Go back there, copying the current subject position
- and mark, and move on past the OP_RECURSE. */
+ and mark, and the start_match position (\K might have changed it), and
+ then move on past the OP_RECURSE. */
P->eptr = Feptr;
P->mark = Fmark;
+ P->start_match = Fstart_match;
F = P;
Fecode += 1 + LINK_SIZE;
continue;
@@ -861,14 +877,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
mb->mark = Fmark; /* and the last success mark */
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
- ovector[0] = Fstart_match - mb->start_subject;
- ovector[1] = Feptr - mb->start_subject;
+ match_data->ovector[0] = Fstart_match - mb->start_subject;
+ match_data->ovector[1] = Feptr - mb->start_subject;
/* Set i to the smaller of the sizes of the external and frame ovectors. */
- i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
- memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
- while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
+ i = 2 * ((top_bracket + 1 > match_data->oveccount)?
+ match_data->oveccount : top_bracket + 1);
+ memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
+ while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;
return MATCH_MATCH; /* Note: NOT RRETURN */
@@ -2419,40 +2436,49 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
const uint32_t *cp;
const ucd_record *prop = GET_UCD(fc);
+ BOOL notmatch = Fop == OP_NOTPROP;
switch(Fecode[1])
{
case PT_ANY:
- if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (notmatch) RRETURN(MATCH_NOMATCH);
break;
case PT_LAMP:
if ((prop->chartype == ucp_Lu ||
prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP))
+ prop->chartype == ucp_Lt) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
case PT_GC:
- if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP))
+ if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
case PT_PC:
- if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP))
+ if ((Fecode[2] == prop->chartype) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
case PT_SC:
- if ((Fecode[2] != prop->script) == (Fop == OP_PROP))
+ if ((Fecode[2] == prop->script) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
+ case PT_SCX:
+ {
+ BOOL ok = (Fecode[2] == prop->script ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);
+ if (ok == notmatch) RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
/* These are specials */
case PT_ALNUM:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP))
+ PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
@@ -2466,12 +2492,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
HSPACE_CASES:
VSPACE_CASES:
- if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (notmatch) RRETURN(MATCH_NOMATCH);
break;
default:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
- (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)
+ RRETURN(MATCH_NOMATCH);
break;
}
break;
@@ -2479,7 +2505,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case PT_WORD:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
- fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP))
+ fc == CHAR_UNDERSCORE) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
@@ -2488,19 +2514,32 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
for (;;)
{
if (fc < *cp)
- { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
+ { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }
if (fc == *cp++)
- { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
+ { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }
}
break;
case PT_UCNC:
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
- fc >= 0xe000) == (Fop == OP_NOTPROP))
+ fc >= 0xe000) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
+ case PT_BIDICL:
+ if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)
+ RRETURN(MATCH_NOMATCH);
+ break;
+
+ case PT_BOOL:
+ {
+ BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), Fecode[2]) != 0;
+ if (ok == notmatch) RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
/* This should never occur */
default:
@@ -2614,18 +2653,20 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* First, ensure the minimum number of matches are present. Use inline
code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). The code for UTF mode is separated out for
- tidiness, except for Unicode property tests. */
+ (i.e. keep it out of the loops). As there are no calls to RMATCH in the
+ loops, we can use an ordinary variable for "notmatch". The code for UTF
+ mode is separated out for tidiness, except for Unicode property tests. */
if (Lmin > 0)
{
#ifdef SUPPORT_UNICODE
if (proptype >= 0) /* Property tests in all modes */
{
+ BOOL notmatch = Lctype == OP_NOTPROP;
switch(proptype)
{
case PT_ANY:
- if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (notmatch) RRETURN(MATCH_NOMATCH);
for (i = 1; i <= Lmin; i++)
{
if (Feptr >= mb->end_subject)
@@ -2650,7 +2691,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
chartype = UCD_CHARTYPE(fc);
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
- chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
+ chartype == ucp_Lt) == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -2664,7 +2705,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
- if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -2678,7 +2719,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
- if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -2692,7 +2733,26 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
- if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_SCX:
+ for (i = 1; i <= Lmin; i++)
+ {
+ BOOL ok;
+ const ucd_record *prop;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ prop = GET_UCD(fc);
+ ok = (prop->script == Lpropvalue ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
+ if (ok == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -2708,7 +2768,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
GETCHARINCTEST(fc, Feptr);
category = UCD_CATEGORY(fc);
- if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
+ if ((category == ucp_L || category == ucp_N) == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -2731,11 +2791,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
HSPACE_CASES:
VSPACE_CASES:
- if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (notmatch) RRETURN(MATCH_NOMATCH);
break;
default:
- if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
+ if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
}
@@ -2754,7 +2814,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
GETCHARINCTEST(fc, Feptr);
category = UCD_CATEGORY(fc);
if ((category == ucp_L || category == ucp_N ||
- fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
+ fc == CHAR_UNDERSCORE) == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -2774,12 +2834,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
if (fc < *cp)
{
- if (Lctype == OP_NOTPROP) break;
+ if (notmatch) break;
RRETURN(MATCH_NOMATCH);
}
if (fc == *cp++)
{
- if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ if (notmatch) RRETURN(MATCH_NOMATCH);
break;
}
}
@@ -2797,7 +2857,40 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
GETCHARINCTEST(fc, Feptr);
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
- fc >= 0xe000) == (Lctype == OP_NOTPROP))
+ fc >= 0xe000) == notmatch)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_BIDICL:
+ for (i = 1; i <= Lmin; i++)
+ {
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
+ case PT_BOOL:
+ for (i = 1; i <= Lmin; i++)
+ {
+ BOOL ok;
+ const ucd_record *prop;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ prop = GET_UCD(fc);
+ ok = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
+ if (ok == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@@ -3341,7 +3434,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Lmin == Lmax) continue;
/* If minimizing, we have to test the rest of the pattern before each
- subsequent match. */
+ subsequent match. This means we cannot use a local "notmatch" variable as
+ in the other cases. As all 4 temporary 32-bit values in the frame are
+ already in use, just test the type each time. */
if (reptype == REPTYPE_MIN)
{
@@ -3438,6 +3533,28 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
/* Control never gets here */
+ case PT_SCX:
+ for (;;)
+ {
+ BOOL ok;
+ const ucd_record *prop;
+ RMATCH(Fecode, RM225);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ prop = GET_UCD(fc);
+ ok = (prop->script == Lpropvalue
+ || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
+ if (ok == (Lctype == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
case PT_ALNUM:
for (;;)
{
@@ -3452,8 +3569,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
GETCHARINCTEST(fc, Feptr);
category = UCD_CATEGORY(fc);
- if ((category == ucp_L || category == ucp_N) ==
- (Lctype == OP_NOTPROP))
+ if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -3560,6 +3676,45 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
/* Control never gets here */
+ case PT_BIDICL:
+ for (;;)
+ {
+ RMATCH(Fecode, RM224);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+ case PT_BOOL:
+ for (;;)
+ {
+ BOOL ok;
+ const ucd_record *prop;
+ RMATCH(Fecode, RM223);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ prop = GET_UCD(fc);
+ ok = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
+ if (ok == (Lctype == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
/* This should never occur */
default:
return PCRE2_ERROR_INTERNAL;
@@ -3868,7 +4023,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
/* If maximizing, it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). */
+ test once at the start (i.e. keep it out of the loops). Once again,
+ "notmatch" can be an ordinary local variable because the loops do not call
+ RMATCH. */
else
{
@@ -3877,6 +4034,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#ifdef SUPPORT_UNICODE
if (proptype >= 0)
{
+ BOOL notmatch = Lctype == OP_NOTPROP;
switch(proptype)
{
case PT_ANY:
@@ -3889,7 +4047,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
GETCHARLENTEST(fc, Feptr, len);
- if (Lctype == OP_NOTPROP) break;
+ if (notmatch) break;
Feptr+= len;
}
break;
@@ -3908,7 +4066,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
chartype = UCD_CHARTYPE(fc);
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
- chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
+ chartype == ucp_Lt) == notmatch)
break;
Feptr+= len;
}
@@ -3924,8 +4082,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
GETCHARLENTEST(fc, Feptr, len);
- if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
- break;
+ if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;
Feptr+= len;
}
break;
@@ -3940,8 +4097,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
GETCHARLENTEST(fc, Feptr, len);
- if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
- break;
+ if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;
Feptr+= len;
}
break;
@@ -3956,8 +4112,27 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
GETCHARLENTEST(fc, Feptr, len);
- if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;
+ Feptr+= len;
+ }
+ break;
+
+ case PT_SCX:
+ for (i = Lmin; i < Lmax; i++)
+ {
+ BOOL ok;
+ const ucd_record *prop;
+ int len = 1;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
break;
+ }
+ GETCHARLENTEST(fc, Feptr, len);
+ prop = GET_UCD(fc);
+ ok = (prop->script == Lpropvalue ||
+ MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
+ if (ok == notmatch) break;
Feptr+= len;
}
break;
@@ -3974,8 +4149,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
GETCHARLENTEST(fc, Feptr, len);
category = UCD_CATEGORY(fc);
- if ((category == ucp_L || category == ucp_N) ==
- (Lctype == OP_NOTPROP))
+ if ((category == ucp_L || category == ucp_N) == notmatch)
break;
Feptr+= len;
}
@@ -4000,11 +4174,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
HSPACE_CASES:
VSPACE_CASES:
- if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */
+ if (notmatch) goto ENDLOOP99; /* Break the loop */
break;
default:
- if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
+ if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
goto ENDLOOP99; /* Break the loop */
break;
}
@@ -4026,7 +4200,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
GETCHARLENTEST(fc, Feptr, len);
category = UCD_CATEGORY(fc);
if ((category == ucp_L || category == ucp_N ||
- fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
+ fc == CHAR_UNDERSCORE) == notmatch)
break;
Feptr+= len;
}
@@ -4047,9 +4221,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
for (;;)
{
if (fc < *cp)
- { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; }
+ { if (notmatch) break; else goto GOT_MAX; }
if (fc == *cp++)
- { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; }
+ { if (notmatch) goto GOT_MAX; else break; }
}
Feptr += len;
}
@@ -4068,12 +4242,47 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
GETCHARLENTEST(fc, Feptr, len);
if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
- fc >= 0xe000) == (Lctype == OP_NOTPROP))
+ fc >= 0xe000) == notmatch)
break;
Feptr += len;
}
break;
+ case PT_BIDICL:
+ for (i = Lmin; i < Lmax; i++)
+ {
+ int len = 1;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(fc, Feptr, len);
+ if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;
+ Feptr+= len;
+ }
+ break;
+
+ case PT_BOOL:
+ for (i = Lmin; i < Lmax; i++)
+ {
+ BOOL ok;
+ const ucd_record *prop;
+ int len = 1;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(fc, Feptr, len);
+ prop = GET_UCD(fc);
+ ok = MAPBIT(PRIV(ucd_boolprop_sets) +
+ UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
+ if (ok == notmatch) break;
+ Feptr+= len;
+ }
+ break;
+
default:
return PCRE2_ERROR_INTERNAL;
}
@@ -5133,7 +5342,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
offset = Flast_group_offset;
while (offset != PCRE2_UNSET)
{
- N = (heapframe *)((char *)mb->match_frames + offset);
+ N = (heapframe *)((char *)match_data->heapframes + offset);
P = (heapframe *)((char *)N - frame_size);
if (N->group_frame_type == (GF_RECURSE | number))
{
@@ -5534,7 +5743,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (*bracode != OP_BRA && *bracode != OP_COND)
{
- N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
+ N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);
P = (heapframe *)((char *)N - frame_size);
Flast_group_offset = P->last_group_offset;
@@ -6064,7 +6273,7 @@ switch (Freturn_id)
LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
- LBL(221) LBL(222)
+ LBL(221) LBL(222) LBL(223) LBL(224) LBL(225)
#endif
default:
@@ -6115,8 +6324,8 @@ BOOL has_req_cu = FALSE;
BOOL startline;
#if PCRE2_CODE_UNIT_WIDTH == 8
-BOOL memchr_not_found_first_cu;
-BOOL memchr_not_found_first_cu2;
+PCRE2_SPTR memchr_found_first_cu;
+PCRE2_SPTR memchr_found_first_cu2;
#endif
PCRE2_UCHAR first_cu = 0;
@@ -6127,8 +6336,8 @@ PCRE2_UCHAR req_cu2 = 0;
PCRE2_SPTR bumpalong_limit;
PCRE2_SPTR end_subject;
PCRE2_SPTR true_end_subject;
-PCRE2_SPTR start_match = subject + start_offset;
-PCRE2_SPTR req_cu_ptr = start_match - 1;
+PCRE2_SPTR start_match;
+PCRE2_SPTR req_cu_ptr;
PCRE2_SPTR start_partial;
PCRE2_SPTR match_partial;
@@ -6151,6 +6360,7 @@ BOOL jit_checked_utf = FALSE;
#endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size;
+PCRE2_SIZE heapframes_size;
/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
macro is used below, and it expects NLBLOCK to be defined as a pointer. */
@@ -6159,18 +6369,18 @@ pcre2_callout_block cb;
match_block actual_match_block;
match_block *mb = &actual_match_block;
-/* Allocate an initial vector of backtracking frames on the stack. If this
-proves to be too small, it is replaced by a larger one on the heap. To get a
-vector of the size required that is aligned for pointers, allocate it as a
-vector of pointers. */
+/* Recognize NULL, length 0 as an empty string. */
-PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
- PCRE2_KEEP_UNINITIALIZED;
-mb->stack_frames = (heapframe *)stack_frames_vector;
+if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
-/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
-subject string. */
+/* Plausibility checks */
+if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
+if (code == NULL || subject == NULL || match_data == NULL)
+ return PCRE2_ERROR_NULL;
+
+start_match = subject + start_offset;
+req_cu_ptr = start_match - 1;
if (length == PCRE2_ZERO_TERMINATED)
{
length = PRIV(strlen)(subject);
@@ -6178,11 +6388,6 @@ if (length == PCRE2_ZERO_TERMINATED)
}
true_end_subject = end_subject = subject + length;
-/* Plausibility checks */
-
-if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
-if (code == NULL || subject == NULL || match_data == NULL)
- return PCRE2_ERROR_NULL;
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
/* Check that the first field in the block is the magic number. */
@@ -6480,7 +6685,7 @@ if (utf &&
/* If the end precedes start_match, it means there is invalid UTF in the
extra code units we reversed over because of a lookbehind. Advance past the
first bad code unit, and then skip invalid character starting code units in
- 8-bit and 16-bit modes, and try again. */
+ 8-bit and 16-bit modes, and try again with the original end point. */
if (end_subject < start_match)
{
@@ -6489,6 +6694,7 @@ if (utf &&
while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
mb->check_subject++;
#endif
+ end_subject = true_end_subject;
}
/* Otherwise, set the not end of line option, and do the match. */
@@ -6593,22 +6799,24 @@ switch(re->newline_convention)
vector at the end, whose size depends on the number of capturing parentheses in
the pattern. It is not used at all if there are no capturing parentheses.
- frame_size is the total size of each frame
- mb->frame_vector_size is the total usable size of the vector (rounded down
- to a whole number of frames)
+ frame_size is the total size of each frame
+ match_data->heapframes is the pointer to the frames vector
+ match_data->heapframes_size is the total size of the vector
-The last of these is changed within the match() function if the frame vector
-has to be expanded. We therefore put it into the match block so that it is
-correct when calling match() more than once for non-anchored patterns. */
+We must pad the frame_size for alignment to ensure subsequent frames are as
+aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
+array, that does not guarantee it is suitably aligned for pointers, as some
+architectures have pointers that are larger than a size_t. */
-frame_size = offsetof(heapframe, ovector) +
- re->top_bracket * 2 * sizeof(PCRE2_SIZE);
+frame_size = (offsetof(heapframe, ovector) +
+ re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &
+ ~(HEAPFRAME_ALIGNMENT - 1);
/* Limits set in the pattern override the match context only if they are
smaller. */
-mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
- mcontext->heap_limit : re->limit_heap;
+mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
+ mcontext->heap_limit : re->limit_heap) * 1024;
mb->match_limit = (mcontext->match_limit < re->limit_match)?
mcontext->match_limit : re->limit_match;
@@ -6617,36 +6825,41 @@ mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
mcontext->depth_limit : re->limit_depth;
/* If a pattern has very many capturing parentheses, the frame size may be very
-large. Ensure that there are at least 10 available frames by getting an initial
-vector on the heap if necessary, except when the heap limit prevents this. Get
-fewer if possible. (The heap limit is in kibibytes.) */
-
-if (frame_size <= START_FRAMES_SIZE/10)
+large. Set the initial frame vector size to ensure that there are at least 10
+available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
+greater than the heap limit, get as large a vector as possible. Always round
+the size to a multiple of the frame size. */
+
+heapframes_size = frame_size * 10;
+if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
+if (heapframes_size > mb->heap_limit)
{
- mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
- mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
+ if (frame_size > mb->heap_limit ) return PCRE2_ERROR_HEAPLIMIT;
+ heapframes_size = mb->heap_limit;
}
-else
+
+/* If an existing frame vector in the match_data block is large enough, we can
+use it.Otherwise, free any pre-existing vector and get a new one. */
+
+if (match_data->heapframes_size < heapframes_size)
{
- mb->frame_vector_size = frame_size * 10;
- if ((mb->frame_vector_size / 1024) > mb->heap_limit)
+ match_data->memctl.free(match_data->heapframes,
+ match_data->memctl.memory_data);
+ match_data->heapframes = match_data->memctl.malloc(heapframes_size,
+ match_data->memctl.memory_data);
+ if (match_data->heapframes == NULL)
{
- if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
- mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
+ match_data->heapframes_size = 0;
+ return PCRE2_ERROR_NOMEMORY;
}
- mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
- mb->memctl.memory_data);
- if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
+ match_data->heapframes_size = heapframes_size;
}
-mb->match_frames_top =
- (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
-
/* Write to the ovector within the first frame to mark every capture unset and
to avoid uninitialized memory read errors when it is copied to a new frame. */
-memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
- re->top_bracket * 2 * sizeof(PCRE2_SIZE));
+memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,
+ frame_size - offsetof(heapframe, ovector));
/* Pointers to the individual character tables */
@@ -6710,8 +6923,8 @@ start_partial = match_partial = NULL;
mb->hitend = FALSE;
#if PCRE2_CODE_UNIT_WIDTH == 8
-memchr_not_found_first_cu = FALSE;
-memchr_not_found_first_cu2 = FALSE;
+memchr_found_first_cu = NULL;
+memchr_found_first_cu2 = NULL;
#endif
for(;;)
@@ -6780,13 +6993,7 @@ for(;;)
}
}
- /* Not anchored. Advance to a unique first code unit if there is one. In
- 8-bit mode, the use of memchr() gives a big speed up, even though we have
- to call it twice in caseless mode, in order to find the earliest occurrence
- of the character in either of its cases. If a call to memchr() that
- searches the rest of the subject fails to find one case, remember that in
- order not to keep on repeating the search. This can make a huge difference
- when the strings are very long and only one case is present. */
+ /* Not anchored. Advance to a unique first code unit if there is one. */
else
{
@@ -6794,43 +7001,68 @@ for(;;)
{
if (first_cu != first_cu2) /* Caseless */
{
+ /* In 16-bit and 32_bit modes we have to do our own search, so can
+ look for both cases at once. */
+
#if PCRE2_CODE_UNIT_WIDTH != 8
PCRE2_UCHAR smc;
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu &&
- smc != first_cu2)
+ smc != first_cu2)
start_match++;
+#else
+ /* In 8-bit mode, the use of memchr() gives a big speed up, even
+ though we have to call it twice in order to find the earliest
+ occurrence of the code unit in either of its cases. Caching is used
+ to remember the positions of previously found code units. This can
+ make a huge difference when the strings are very long and only one
+ case is actually present. */
-#else /* 8-bit code units */
PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL;
- PCRE2_SIZE cu2size = end_subject - start_match;
+ PCRE2_SIZE searchlength = end_subject - start_match;
- if (!memchr_not_found_first_cu)
+ /* If we haven't got a previously found position for first_cu, or if
+ the current starting position is later, we need to do a search. If
+ the code unit is not found, set it to the end. */
+
+ if (memchr_found_first_cu == NULL ||
+ start_match > memchr_found_first_cu)
{
- pp1 = memchr(start_match, first_cu, end_subject - start_match);
- if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
- else cu2size = pp1 - start_match;
+ pp1 = memchr(start_match, first_cu, searchlength);
+ memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
}
- /* If pp1 is not NULL, we have arranged to search only as far as pp1,
- to see if the other case is earlier, so we can set "not found" only
- when both searches have returned NULL. */
+ /* If the start is before a previously found position, use the
+ previous position, or NULL if a previous search failed. */
+
+ else pp1 = (memchr_found_first_cu == end_subject)? NULL :
+ memchr_found_first_cu;
+
+ /* Do the same thing for the other case. */
- if (!memchr_not_found_first_cu2)
+ if (memchr_found_first_cu2 == NULL ||
+ start_match > memchr_found_first_cu2)
{
- pp2 = memchr(start_match, first_cu2, cu2size);
- memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+ pp2 = memchr(start_match, first_cu2, searchlength);
+ memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
}
+ else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
+ memchr_found_first_cu2;
+
+ /* Set the start to the end of the subject if neither case was found.
+ Otherwise, use the earlier found point. */
+
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
-#endif
+
+#endif /* 8-bit handling */
}
- /* The caseful case */
+ /* The caseful case is much simpler. */
else
{
@@ -7054,8 +7286,8 @@ for(;;)
mb->end_offset_top = 0;
mb->skip_arg_count = 0;
- rc = match(start_match, mb->start_code, match_data->ovector,
- match_data->oveccount, re->top_bracket, frame_size, mb);
+ rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
+ match_data, mb);
if (mb->hitend && start_partial == NULL)
{
@@ -7238,11 +7470,6 @@ if (utf && end_subject != true_end_subject &&
}
#endif /* SUPPORT_UNICODE */
-/* Release an enlarged frame vector that is on the heap. */
-
-if (mb->match_frames != mb->stack_frames)
- mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
-
/* Fill in fields that are always returned in the match data. */
match_data->code = re;
@@ -7308,4 +7535,10 @@ else match_data->rc = PCRE2_ERROR_NOMATCH;
return match_data->rc;
}
+/* These #undefs are here to enable unity builds with CMake. */
+
+#undef NLBLOCK /* Block containing newline information */
+#undef PSSTART /* Field containing processed string start */
+#undef PSEND /* Field containing processed string end */
+
/* End of pcre2_match.c */