summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_match.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_match.c')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_match.c336
1 files changed, 87 insertions, 249 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_match.c b/src/3rdparty/pcre2/src/pcre2_match.c
index 050b7e93ec..79cc93f918 100644
--- a/src/3rdparty/pcre2/src/pcre2_match.c
+++ b/src/3rdparty/pcre2/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2015-2017 University of Cambridge
+ New API code Copyright (c) 2015-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -249,7 +249,8 @@ for (i = 0, Q = mb->match_frames;
/* This function is called for all callouts, whether "standalone" or at the
start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
-OP_CALLOUT_STR.
+OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
+with fixed values.
Arguments:
F points to the current backtracking frame
@@ -266,7 +267,7 @@ do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
int rc;
PCRE2_SIZE save0, save1;
PCRE2_SIZE *callout_ovector;
-pcre2_callout_block cb;
+pcre2_callout_block *cb;
*lengthptr = (*Fecode == OP_CALLOUT)?
PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
@@ -285,40 +286,42 @@ pointer. */
callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
-cb.version = 1;
-cb.capture_top = (uint32_t)Foffset_top/2 + 1;
-cb.capture_last = Fcapture_last;
-cb.offset_vector = callout_ovector;
-cb.mark = mb->nomatch_mark;
-cb.subject = mb->start_subject;
-cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
-cb.start_match = (PCRE2_SIZE)(Fstart_match - mb->start_subject);
-cb.current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
-cb.pattern_position = GET(Fecode, 1);
-cb.next_item_length = GET(Fecode, 1 + LINK_SIZE);
+/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
+are set externally. The first 3 never change; the last is updated for each
+bumpalong. */
+
+cb = mb->cb;
+cb->capture_top = (uint32_t)Foffset_top/2 + 1;
+cb->capture_last = Fcapture_last;
+cb->offset_vector = callout_ovector;
+cb->mark = mb->nomatch_mark;
+cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
+cb->pattern_position = GET(Fecode, 1);
+cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
if (*Fecode == OP_CALLOUT) /* Numerical callout */
{
- cb.callout_number = Fecode[1 + 2*LINK_SIZE];
- cb.callout_string_offset = 0;
- cb.callout_string = NULL;
- cb.callout_string_length = 0;
+ cb->callout_number = Fecode[1 + 2*LINK_SIZE];
+ cb->callout_string_offset = 0;
+ cb->callout_string = NULL;
+ cb->callout_string_length = 0;
}
else /* String callout */
{
- cb.callout_number = 0;
- cb.callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
- cb.callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
- cb.callout_string_length =
+ cb->callout_number = 0;
+ cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
+ cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
+ cb->callout_string_length =
*lengthptr - (1 + 4*LINK_SIZE) - 2;
}
save0 = callout_ovector[0];
save1 = callout_ovector[1];
callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
-rc = mb->callout(&cb, mb->callout_data);
+rc = mb->callout(cb, mb->callout_data);
callout_ovector[0] = save0;
callout_ovector[1] = save1;
+cb->callout_flags = 0;
return rc;
}
@@ -729,7 +732,7 @@ for (;;)
fprintf(stderr, "++ op=%d\n", *Fecode);
#endif
- Fop = *Fecode;
+ Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
switch(Fop)
{
/* ===================================================================== */
@@ -876,7 +879,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
Feptr++;
#ifdef SUPPORT_UNICODE
- if (utf) ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
#endif
Fecode++;
break;
@@ -2440,55 +2443,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
{
- int lgb, rgb;
GETCHARINCTEST(fc, Feptr);
- lgb = UCD_GRAPHBREAK(fc);
- while (Feptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); }
- rgb = UCD_GRAPHBREAK(fc);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
-
- /* Not breaking between Regional Indicators is allowed only if there
- are an even number of preceding RIs. */
-
- if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator)
- {
- int ricount = 0;
- PCRE2_SPTR bptr = Feptr - 1;
-#ifdef SUPPORT_UNICODE
- if (utf) BACKCHAR(bptr);
-#endif
- /* bptr is pointing to the left-hand character */
-
- while (bptr > mb->start_subject)
- {
- bptr--;
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- BACKCHAR(bptr);
- GETCHAR(fc, bptr);
- }
- else
-#endif
- fc = *bptr;
- if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break;
- ricount++;
- }
- if ((ricount & 1) != 0) break; /* Grapheme break required */
- }
-
- /* If Extend follows E_Base[_GAZ] do not update lgb; this allows
- any number of Extend before a following E_Modifier. */
-
- if (rgb != ucp_gbExtend ||
- (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ))
- lgb = rgb;
-
- Feptr += len;
- }
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
+ NULL);
}
CHECK_PARTIAL();
Fecode++;
@@ -2785,61 +2742,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
{
- int lgb, rgb;
GETCHARINCTEST(fc, Feptr);
- lgb = UCD_GRAPHBREAK(fc);
- while (Feptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); }
- rgb = UCD_GRAPHBREAK(fc);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
-
- /* Not breaking between Regional Indicators is allowed only if
- there are an even number of preceding RIs. */
-
- if (lgb == ucp_gbRegionalIndicator &&
- rgb == ucp_gbRegionalIndicator)
- {
- int ricount = 0;
- PCRE2_SPTR bptr = Feptr - 1;
-#ifdef SUPPORT_UNICODE
- if (utf) BACKCHAR(bptr);
-#endif
- /* bptr is pointing to the left-hand character */
-
- while (bptr > mb->start_subject)
- {
- bptr--;
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- BACKCHAR(bptr);
- GETCHAR(fc, bptr);
- }
- else
-#endif
- fc = *bptr;
- if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break;
- ricount++;
- }
- if ((ricount & 1) != 0) break; /* Grapheme break required */
- }
-
- /* If Extend follows E_Base[_GAZ] do not update lgb; this allows
- any number of Extend before a following E_Modifier. */
-
- if (rgb != ucp_gbExtend ||
- (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ))
- lgb = rgb;
-
- Feptr += len;
- }
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
+ mb->end_subject, utf, NULL);
}
CHECK_PARTIAL();
}
}
-
else
#endif /* SUPPORT_UNICODE */
@@ -2867,7 +2776,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
Feptr++;
- ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
@@ -2880,7 +2789,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
Feptr++;
- ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
@@ -3034,7 +2943,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
Feptr++;
- ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
@@ -3068,7 +2977,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
Feptr++;
- ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
@@ -3593,56 +3502,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
{
- int lgb, rgb;
GETCHARINCTEST(fc, Feptr);
- lgb = UCD_GRAPHBREAK(fc);
- while (Feptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); }
- rgb = UCD_GRAPHBREAK(fc);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
-
- /* Not breaking between Regional Indicators is allowed only if
- there are an even number of preceding RIs. */
-
- if (lgb == ucp_gbRegionalIndicator &&
- rgb == ucp_gbRegionalIndicator)
- {
- int ricount = 0;
- PCRE2_SPTR bptr = Feptr - 1;
-#ifdef SUPPORT_UNICODE
- if (utf) BACKCHAR(bptr);
-#endif
- /* bptr is pointing to the left-hand character */
-
- while (bptr > mb->start_subject)
- {
- bptr--;
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- BACKCHAR(bptr);
- GETCHAR(fc, bptr);
- }
- else
-#endif
- fc = *bptr;
- if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break;
- ricount++;
- }
- if ((ricount & 1) != 0) break; /* Grapheme break required */
- }
-
- /* If Extend follows E_Base[_GAZ] do not update lgb; this allows
- any number of Extend before a following E_Modifier. */
-
- if (rgb != ucp_gbExtend ||
- (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ))
- lgb = rgb;
-
- Feptr += len;
- }
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
+ utf, NULL);
}
CHECK_PARTIAL();
}
@@ -4167,56 +4029,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
{
- int lgb, rgb;
GETCHARINCTEST(fc, Feptr);
- lgb = UCD_GRAPHBREAK(fc);
- while (Feptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) fc = *Feptr; else { GETCHARLEN(fc, Feptr, len); }
- rgb = UCD_GRAPHBREAK(fc);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
-
- /* Not breaking between Regional Indicators is allowed only if
- there are an even number of preceding RIs. */
-
- if (lgb == ucp_gbRegionalIndicator &&
- rgb == ucp_gbRegionalIndicator)
- {
- int ricount = 0;
- PCRE2_SPTR bptr = Feptr - 1;
-#ifdef SUPPORT_UNICODE
- if (utf) BACKCHAR(bptr);
-#endif
- /* bptr is pointing to the left-hand character */
-
- while (bptr > mb->start_subject)
- {
- bptr--;
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- BACKCHAR(bptr);
- GETCHAR(fc, bptr);
- }
- else
-#endif
- fc = *bptr;
- if (UCD_GRAPHBREAK(fc) != ucp_gbRegionalIndicator) break;
- ricount++;
- }
- if ((ricount & 1) != 0) break; /* Grapheme break required */
- }
-
- /* If Extend follows E_Base[_GAZ] do not update lgb; this allows
- any number of Extend before a following E_Modifier. */
-
- if (rgb != ucp_gbExtend ||
- (lgb != ucp_gbE_Base && lgb != ucp_gbE_Base_GAZ))
- lgb = rgb;
-
- Feptr += len;
- }
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
+ utf, NULL);
}
CHECK_PARTIAL();
}
@@ -4295,7 +4110,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
Feptr++;
- ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
@@ -4310,7 +4125,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
Feptr++;
- ACROSSCHAR(Feptr < mb->end_subject, *Feptr, Feptr++);
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
}
else
@@ -5240,7 +5055,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
P = (heapframe *)((char *)N - frame_size);
if (N->group_frame_type == (GF_RECURSE | number))
{
- if (Feptr == P->eptr) RRETURN(PCRE2_ERROR_RECURSELOOP);
+ if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
break;
}
offset = P->last_group_offset;
@@ -6105,8 +5920,9 @@ in rrc. */
#define LBL(val) case val: goto L_RM##val;
RETURN_SWITCH:
-if (Frdepth == 0) return rrc; /* Exit from the top level */
-F = (heapframe *)((char *)F - Fback_frame); /* Back track */
+if (Frdepth == 0) return rrc; /* Exit from the top level */
+F = (heapframe *)((char *)F - Fback_frame); /* Back track */
+mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
#ifdef DEBUG_SHOW_RMATCH
fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
@@ -6196,6 +6012,7 @@ PCRE2_SIZE frame_size;
/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
macro is used below, and it expects NLBLOCK to be defined as a pointer. */
+pcre2_callout_block cb;
match_block actual_match_block;
match_block *mb = &actual_match_block;
@@ -6356,6 +6173,15 @@ startline = (re->flags & PCRE2_STARTLINE) != 0;
bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
end_subject : subject + mcontext->offset_limit;
+/* Initialize and set up the fixed fields in the callout block, with a pointer
+in the match block. */
+
+mb->cb = &cb;
+cb.version = 2;
+cb.subject = subject;
+cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
+cb.callout_flags = 0;
+
/* Fill in the remaining fields in the match block. */
mb->callout = mcontext->callout;
@@ -6537,13 +6363,11 @@ for(;;)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
{
- PCRE2_SPTR save_end_subject = end_subject;
-
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the scans for a first code unit at a newline.
+ If the match fails at the newline, later code breaks the loop. */
if (firstline)
{
@@ -6551,15 +6375,15 @@ for(;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
- ACROSSCHAR(t < end_subject, *t, t++);
+ ACROSSCHAR(t < end_subject, t, t++);
}
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
@@ -6635,13 +6459,17 @@ for(;;)
#endif
}
- /* If we can't find the required code unit, break the bumpalong loop,
- to force a match failure, except when doing partial matching, when we
- let the next cycle run at the end of the subject. To see why, consider
- the pattern /(?<=abc)def/, which partially matches "abc", even though
- the string does not contain the starting character "d". */
-
- if (!mb->partial && start_match >= end_subject)
+ /* If we can't find the required code unit, having reached the true end
+ of the subject, break the bumpalong loop, to force a match failure,
+ except when doing partial matching, when we let the next cycle run at
+ the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+ which partially matches "abc", even though the string does not contain
+ the starting character "d". If we have not reached the true end of the
+ subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+ we also let the cycle run, because the matching string is legitimately
+ allowed to start with the first code unit of a newline. */
+
+ if (!mb->partial && start_match >= mb->end_subject)
{
rc = MATCH_NOMATCH;
break;
@@ -6661,8 +6489,7 @@ for(;;)
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
start_match++;
- ACROSSCHAR(start_match < end_subject, *start_match,
- start_match++);
+ ACROSSCHAR(start_match < end_subject, start_match, start_match++);
}
}
else
@@ -6698,12 +6525,20 @@ for(;;)
if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
start_match++;
}
+
+ /* See comment above in first_cu checking about the next few lines. */
+
+ if (!mb->partial && start_match >= mb->end_subject)
+ {
+ rc = MATCH_NOMATCH;
+ break;
+ }
}
} /* End first code unit handling */
/* Restore fudged end_subject */
- end_subject = save_end_subject;
+ end_subject = mb->end_subject;
/* The following two optimizations must be disabled for partial matching. */
@@ -6820,6 +6655,9 @@ for(;;)
/* OK, we can now run the match. If "hitend" is set afterwards, remember the
first starting point for which a partial match was found. */
+ cb.start_match = (PCRE2_SIZE)(start_match - subject);
+ cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
+
mb->start_used_ptr = start_match;
mb->last_used_ptr = start_match;
mb->match_call_count = 0;
@@ -6870,7 +6708,7 @@ for(;;)
new_start_match = start_match + 1;
#ifdef SUPPORT_UNICODE
if (utf)
- ACROSSCHAR(new_start_match < end_subject, *new_start_match,
+ ACROSSCHAR(new_start_match < end_subject, new_start_match,
new_start_match++);
#endif
break;