summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre/pcre_exec.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre/pcre_exec.c')
-rw-r--r--src/3rdparty/pcre/pcre_exec.c959
1 files changed, 546 insertions, 413 deletions
diff --git a/src/3rdparty/pcre/pcre_exec.c b/src/3rdparty/pcre/pcre_exec.c
index c888468a25..913521ff0c 100644
--- a/src/3rdparty/pcre/pcre_exec.c
+++ b/src/3rdparty/pcre/pcre_exec.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2012 University of Cambridge
+ Copyright (c) 1997-2013 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -56,6 +56,20 @@ possible. There are also some static supporting functions. */
#undef min
#undef max
+/* The md->capture_last field uses the lower 16 bits for the last captured
+substring (which can never be greater than 65535) and a bit in the top half
+to mean "capture vector overflowed". This odd way of doing things was
+implemented when it was realized that preserving and restoring the overflow bit
+whenever the last capture number was saved/restored made for a neater
+interface, and doing it this way saved on (a) another variable, which would
+have increased the stack frame size (a big NO-NO in PCRE) and (b) another
+separate set of save/restore instructions. The following defines are used in
+implementing this. */
+
+#define CAPLMASK 0x0000ffff /* The bits used for last_capture */
+#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
+#define OVFLBIT 0x00010000 /* The bit that is set for overflow */
+
/* Values for setting in md->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. */
@@ -73,13 +87,17 @@ defined PCRE_ERROR_xxx codes, which are all negative. */
negative to avoid the external error codes. */
#define MATCH_ACCEPT (-999)
-#define MATCH_COMMIT (-998)
-#define MATCH_KETRPOS (-997)
-#define MATCH_ONCE (-996)
+#define MATCH_KETRPOS (-998)
+#define MATCH_ONCE (-997)
+/* The next 5 must be kept together and in sequence so that a test that checks
+for any one of them can use a range. */
+#define MATCH_COMMIT (-996)
#define MATCH_PRUNE (-995)
#define MATCH_SKIP (-994)
#define MATCH_SKIP_ARG (-993)
#define MATCH_THEN (-992)
+#define MATCH_BACKTRACK_MAX MATCH_THEN
+#define MATCH_BACKTRACK_MIN MATCH_COMMIT
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
@@ -89,8 +107,8 @@ because the offset vector is always a multiple of 3 long. */
/* Min and max values for the common repeats; for the maxima, 0 => infinity */
-static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
-static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
+static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
+static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
#ifdef PCRE_DEBUG
/*************************************************
@@ -149,7 +167,7 @@ match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
{
PCRE_PUCHAR eptr_start = eptr;
register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined SUPPORT_UCP
BOOL utf = md->utf;
#endif
@@ -177,8 +195,7 @@ ASCII characters. */
if (caseless)
{
-#ifdef SUPPORT_UTF
-#ifdef SUPPORT_UCP
+#if defined SUPPORT_UTF && defined SUPPORT_UCP
if (utf)
{
/* Match characters up to the end of the reference. NOTE: the number of
@@ -212,14 +229,13 @@ if (caseless)
}
else
#endif
-#endif
/* The same code works when not in UTF-8 mode and in UTF-8 mode when there
is no UCP support. */
{
while (length-- > 0)
{
- pcre_uchar cc, cp;
+ pcre_uint32 cc, cp;
if (eptr >= md->end_subject) return -2; /* Partial match */
cc = RAWUCHARTEST(eptr);
cp = RAWUCHARTEST(p);
@@ -416,10 +432,10 @@ typedef struct heapframe {
int Xlength;
int Xmax;
int Xmin;
- int Xnumber;
+ unsigned int Xnumber;
int Xoffset;
- int Xop;
- int Xsave_capture_last;
+ unsigned int Xop;
+ pcre_int32 Xsave_capture_last;
int Xsave_offset1, Xsave_offset2, Xsave_offset3;
int Xstacksave[REC_STACK_SAVE_MAX];
@@ -634,8 +650,8 @@ int max;
int min;
unsigned int number;
int offset;
-pcre_uchar op;
-int save_capture_last;
+unsigned int op;
+pcre_int32 save_capture_last;
int save_offset1, save_offset2, save_offset3;
int stacksave[REC_STACK_SAVE_MAX];
@@ -763,23 +779,16 @@ for (;;)
case OP_FAIL:
RRETURN(MATCH_NOMATCH);
- /* COMMIT overrides PRUNE, SKIP, and THEN */
-
case OP_COMMIT:
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM52);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
- rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
- rrc != MATCH_THEN)
- RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_COMMIT);
- /* PRUNE overrides THEN */
-
case OP_PRUNE:
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM51);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_PRUNE);
case OP_PRUNE_ARG:
@@ -789,38 +798,39 @@ for (;;)
eptrb, RM56);
if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
md->mark == NULL) md->mark = ecode + 2;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
RRETURN(MATCH_PRUNE);
- /* SKIP overrides PRUNE and THEN */
-
case OP_SKIP:
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM53);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
- RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
md->start_match_ptr = eptr; /* Pass back current position */
RRETURN(MATCH_SKIP);
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
- nomatch_mark. There is a flag that disables this opcode when re-matching a
- pattern that ended with a SKIP for which there was not a matching MARK. */
+ nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
+ not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
+ that failed and any that precede it (either they also failed, or were not
+ triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
+ SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
+ set to the count of the one that failed. */
case OP_SKIP_ARG:
- if (md->ignore_skip_arg)
+ md->skip_arg_count++;
+ if (md->skip_arg_count <= md->ignore_skip_arg)
{
ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
break;
}
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
eptrb, RM57);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
- RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* Pass back the current skip name by overloading md->start_match_ptr and
returning the special MATCH_SKIP_ARG return code. This will either be
caught by a matching MARK, or get to the top, where it causes a rematch
- with the md->ignore_skip_arg flag set. */
+ with md->ignore_skip_arg set to the value of md->skip_arg_count. */
md->start_match_ptr = ecode + 2;
RRETURN(MATCH_SKIP_ARG);
@@ -1066,6 +1076,7 @@ for (;;)
/* In all other cases, we have to make another call to match(). */
save_mark = md->mark;
+ save_capture_last = md->capture_last;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
RM2);
@@ -1097,6 +1108,7 @@ for (;;)
ecode += GET(ecode, 1);
md->mark = save_mark;
if (*ecode != OP_ALT) break;
+ md->capture_last = save_capture_last;
}
RRETURN(MATCH_NOMATCH);
@@ -1159,6 +1171,7 @@ for (;;)
ecode = md->start_code + code_offset;
save_capture_last = md->capture_last;
matched_once = TRUE;
+ mstart = md->start_match_ptr; /* In case \K changed it */
continue;
}
@@ -1218,6 +1231,7 @@ for (;;)
POSSESSIVE_NON_CAPTURE:
matched_once = FALSE;
code_offset = (int)(ecode - md->start_code);
+ save_capture_last = md->capture_last;
for (;;)
{
@@ -1230,6 +1244,7 @@ for (;;)
eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
matched_once = TRUE;
+ mstart = md->start_match_ptr; /* In case \K reset it */
continue;
}
@@ -1247,6 +1262,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
+ md->capture_last = save_capture_last;
}
if (matched_once || allow_zero)
@@ -1258,25 +1274,32 @@ for (;;)
/* Control never reaches here. */
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. */
+ /* Conditional group: compilation checked that there are no more than two
+ branches. If the condition is false, skipping the first branch takes us
+ past the end of the item if there is only one branch, but that's exactly
+ what we want. */
case OP_COND:
case OP_SCOND:
- codelink = GET(ecode, 1);
+
+ /* The variable codelink will be added to ecode when the condition is
+ false, to get to the second branch. Setting it to the offset to the ALT
+ or KET, then incrementing ecode achieves this effect. We now have ecode
+ pointing to the condition or callout. */
+
+ codelink = GET(ecode, 1); /* Offset to the second branch */
+ ecode += 1 + LINK_SIZE; /* From this opcode */
/* Because of the way auto-callout works during compile, a callout item is
inserted between OP_COND and an assertion condition. */
- if (ecode[LINK_SIZE+1] == OP_CALLOUT)
+ if (*ecode == OP_CALLOUT)
{
if (PUBL(callout) != NULL)
{
PUBL(callout_block) cb;
cb.version = 2; /* Version 1 of the callout block */
- cb.callout_number = ecode[LINK_SIZE+2];
+ cb.callout_number = ecode[1];
cb.offset_vector = md->offset_vector;
#if defined COMPILE_PCRE8
cb.subject = (PCRE_SPTR)md->start_subject;
@@ -1288,215 +1311,130 @@ for (;;)
cb.subject_length = (int)(md->end_subject - md->start_subject);
cb.start_match = (int)(mstart - md->start_subject);
cb.current_position = (int)(eptr - md->start_subject);
- cb.pattern_position = GET(ecode, LINK_SIZE + 3);
- cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
+ cb.pattern_position = GET(ecode, 2);
+ cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
+ cb.capture_last = md->capture_last & CAPLMASK;
+ /* Internal change requires this for API compatibility. */
+ if (cb.capture_last == 0) cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = md->nomatch_mark;
if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
+
+ /* Advance ecode past the callout, so it now points to the condition. We
+ must adjust codelink so that the value of ecode+codelink is unchanged. */
+
ecode += PRIV(OP_lengths)[OP_CALLOUT];
+ codelink -= PRIV(OP_lengths)[OP_CALLOUT];
}
- condcode = ecode[LINK_SIZE+1];
+ /* Test the various possible conditions */
- /* Now see what the actual condition is */
-
- if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
+ condition = FALSE;
+ switch(condcode = *ecode)
{
- if (md->recursive == NULL) /* Not recursing => FALSE */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
- else
+ case OP_RREF: /* Numbered group recursion test */
+ if (md->recursive != NULL) /* Not recursing => FALSE */
{
- unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
+ unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
condition = (recno == RREF_ANY || recno == md->recursive->group_num);
+ }
+ break;
- /* If the test is for recursion into a specific subpattern, and it is
- false, but the test was set up by name, scan the table to see if the
- name refers to any other numbers, and test them. The condition is true
- if any one is set. */
-
- if (!condition && condcode == OP_NRREF)
+ case OP_DNRREF: /* Duplicate named group recursion test */
+ if (md->recursive != NULL)
+ {
+ int count = GET2(ecode, 1 + IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ while (count-- > 0)
{
- pcre_uchar *slotA = md->name_table;
- for (i = 0; i < md->name_count; i++)
- {
- if (GET2(slotA, 0) == recno) break;
- slotA += md->name_entry_size;
- }
-
- /* Found a name for the number - there can be only one; duplicate
- names for different numbers are allowed, but not vice versa. First
- scan down for duplicates. */
-
- if (i < md->name_count)
- {
- pcre_uchar *slotB = slotA;
- while (slotB > md->name_table)
- {
- slotB -= md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- condition = GET2(slotB, 0) == md->recursive->group_num;
- if (condition) break;
- }
- else break;
- }
-
- /* Scan up for duplicates */
-
- if (!condition)
- {
- slotB = slotA;
- for (i++; i < md->name_count; i++)
- {
- slotB += md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- condition = GET2(slotB, 0) == md->recursive->group_num;
- if (condition) break;
- }
- else break;
- }
- }
- }
+ unsigned int recno = GET2(slot, 0);
+ condition = recno == md->recursive->group_num;
+ if (condition) break;
+ slot += md->name_entry_size;
}
-
- /* Chose branch according to the condition */
-
- ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
}
- }
+ break;
- else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
- {
- offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
+ case OP_CREF: /* Numbered group used test */
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
condition = offset < offset_top && md->offset_vector[offset] >= 0;
+ break;
- /* If the numbered capture is unset, but the reference was by name,
- scan the table to see if the name refers to any other numbers, and test
- them. The condition is true if any one is set. This is tediously similar
- to the code above, but not close enough to try to amalgamate. */
-
- if (!condition && condcode == OP_NCREF)
+ case OP_DNCREF: /* Duplicate named group used test */
{
- unsigned int refno = offset >> 1;
- pcre_uchar *slotA = md->name_table;
-
- for (i = 0; i < md->name_count; i++)
+ int count = GET2(ecode, 1 + IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ while (count-- > 0)
{
- if (GET2(slotA, 0) == refno) break;
- slotA += md->name_entry_size;
- }
-
- /* Found a name for the number - there can be only one; duplicate names
- for different numbers are allowed, but not vice versa. First scan down
- for duplicates. */
-
- if (i < md->name_count)
- {
- pcre_uchar *slotB = slotA;
- while (slotB > md->name_table)
- {
- slotB -= md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- offset = GET2(slotB, 0) << 1;
- condition = offset < offset_top &&
- md->offset_vector[offset] >= 0;
- if (condition) break;
- }
- else break;
- }
-
- /* Scan up for duplicates */
-
- if (!condition)
- {
- slotB = slotA;
- for (i++; i < md->name_count; i++)
- {
- slotB += md->name_entry_size;
- if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
- {
- offset = GET2(slotB, 0) << 1;
- condition = offset < offset_top &&
- md->offset_vector[offset] >= 0;
- if (condition) break;
- }
- else break;
- }
- }
+ offset = GET2(slot, 0) << 1;
+ condition = offset < offset_top && md->offset_vector[offset] >= 0;
+ if (condition) break;
+ slot += md->name_entry_size;
}
}
+ break;
- /* Chose branch according to the condition */
-
- ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
- }
-
- else if (condcode == OP_DEF) /* DEFINE - always false */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
+ case OP_DEF: /* DEFINE - always false */
+ break;
- /* The condition is an assertion. Call match() to evaluate it - setting
- md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
- an assertion. */
+ /* The condition is an assertion. Call match() to evaluate it - setting
+ md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
+ of an assertion. */
- else
- {
+ default:
md->match_function_type = MATCH_CONDASSERT;
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
+ RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
if (rrc == MATCH_MATCH)
{
if (md->end_offset_top > offset_top)
offset_top = md->end_offset_top; /* Captures may have happened */
condition = TRUE;
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
+
+ /* Advance ecode past the assertion to the start of the first branch,
+ but adjust it so that the general choosing code below works. */
+
+ ecode += GET(ecode, 1);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
+ ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
}
/* PCRE doesn't allow the effect of (*THEN) to escape beyond an
- assertion; it is therefore treated as NOMATCH. */
+ assertion; it is therefore treated as NOMATCH. Any other return is an
+ error. */
else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
{
RRETURN(rrc); /* Need braces because of following else */
}
- else
- {
- condition = FALSE;
- ecode += codelink;
- }
+ break;
}
- /* We are now at the branch that is to be obeyed. As there is only one, can
- use tail recursion to avoid using another stack frame, except when there is
- unlimited repeat of a possibly empty group. In the latter case, a recursive
- call to match() is always required, unless the second alternative doesn't
- exist, in which case we can just plough on. Note that, for compatibility
- with Perl, the | in a conditional group is NOT treated as creating two
- alternatives. If a THEN is encountered in the branch, it propagates out to
- the enclosing alternative (unless nested in a deeper set of alternatives,
- of course). */
-
- if (condition || *ecode == OP_ALT)
+ /* Choose branch according to the condition */
+
+ ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
+
+ /* We are now at the branch that is to be obeyed. As there is only one, we
+ can use tail recursion to avoid using another stack frame, except when
+ there is unlimited repeat of a possibly empty group. In the latter case, a
+ recursive call to match() is always required, unless the second alternative
+ doesn't exist, in which case we can just plough on. Note that, for
+ compatibility with Perl, the | in a conditional group is NOT treated as
+ creating two alternatives. If a THEN is encountered in the branch, it
+ propagates out to the enclosing alternative (unless nested in a deeper set
+ of alternatives, of course). */
+
+ if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
{
if (op != OP_SCOND)
{
- ecode += 1 + LINK_SIZE;
goto TAIL_RECURSE;
}
md->match_function_type = MATCH_CBEGROUP;
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
+ RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
RRETURN(rrc);
}
@@ -1504,7 +1442,6 @@ for (;;)
else
{
- ecode += 1 + LINK_SIZE;
}
break;
@@ -1513,7 +1450,7 @@ for (;;)
to close any currently open capturing brackets. */
case OP_CLOSE:
- number = GET2(ecode, 1);
+ number = GET2(ecode, 1); /* Must be less than 65536 */
offset = number << 1;
#ifdef PCRE_DEBUG
@@ -1521,8 +1458,8 @@ for (;;)
printf("\n");
#endif
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ md->capture_last = (md->capture_last & OVFLMASK) | number;
+ if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
@@ -1584,28 +1521,49 @@ for (;;)
}
else condassert = FALSE;
+ /* Loop for each branch */
+
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
+
+ /* A match means that the assertion is true; break out of the loop
+ that matches its alternatives. */
+
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
mstart = md->start_match_ptr; /* In case \K reset it */
break;
}
+
+ /* If not matched, restore the previous mark setting. */
+
md->mark = save_mark;
- /* A COMMIT failure must fail the entire assertion, without trying any
- subsequent branches. */
+ /* See comment in the code for capturing groups above about handling
+ THEN. */
- if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
+ if (rrc == MATCH_THEN)
+ {
+ next = ecode + GET(ecode,1);
+ if (md->start_match_ptr < next &&
+ (*ecode == OP_ALT || *next == OP_ALT))
+ rrc = MATCH_NOMATCH;
+ }
- /* PCRE does not allow THEN to escape beyond an assertion; it
- is treated as NOMATCH. */
+ /* Anything other than NOMATCH causes the entire assertion to fail,
+ passing back the return code. This includes COMMIT, SKIP, PRUNE and an
+ uncaptured THEN, which means they take their normal effect. This
+ consistent approach does not always have exactly the same effect as in
+ Perl. */
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
}
- while (*ecode == OP_ALT);
+ while (*ecode == OP_ALT); /* Continue for next alternative */
+
+ /* If we have tried all the alternative branches, the assertion has
+ failed. If not, we broke out after a match. */
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
@@ -1613,17 +1571,16 @@ for (;;)
if (condassert) RRETURN(MATCH_MATCH);
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
+ /* Continue from after a successful assertion, updating the offsets high
+ water mark, since extracts may have been taken during the assertion. */
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
ecode += 1 + LINK_SIZE;
offset_top = md->end_offset_top;
continue;
- /* Negative assertion: all branches must fail to match. Encountering SKIP,
- PRUNE, or COMMIT means we must assume failure without checking subsequent
- branches. */
+ /* Negative assertion: all branches must fail to match for the assertion to
+ succeed. */
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
@@ -1635,28 +1592,64 @@ for (;;)
}
else condassert = FALSE;
+ /* Loop for each alternative branch. */
+
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
- md->mark = save_mark;
- if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
- if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
+ md->mark = save_mark; /* Always restore the mark setting */
+
+ switch(rrc)
{
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ case MATCH_MATCH: /* A successful match means */
+ case MATCH_ACCEPT: /* the assertion has failed. */
+ RRETURN(MATCH_NOMATCH);
+
+ case MATCH_NOMATCH: /* Carry on with next branch */
break;
+
+ /* See comment in the code for capturing groups above about handling
+ THEN. */
+
+ case MATCH_THEN:
+ next = ecode + GET(ecode,1);
+ if (md->start_match_ptr < next &&
+ (*ecode == OP_ALT || *next == OP_ALT))
+ {
+ rrc = MATCH_NOMATCH;
+ break;
+ }
+ /* Otherwise fall through. */
+
+ /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
+ assertion to fail to match, without considering any more alternatives.
+ Failing to match means the assertion is true. This is a consistent
+ approach, but does not always have the same effect as in Perl. */
+
+ case MATCH_COMMIT:
+ case MATCH_SKIP:
+ case MATCH_SKIP_ARG:
+ case MATCH_PRUNE:
+ do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
+
+ /* Anything else is an error */
+
+ default:
+ RRETURN(rrc);
}
- /* PCRE does not allow THEN to escape beyond an assertion; it is treated
- as NOMATCH. */
+ /* Continue with next branch */
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
- if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
+ /* All branches in the assertion failed to match. */
- ecode += 1 + LINK_SIZE;
+ NEG_ASSERT_TRUE:
+ if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
+ ecode += 1 + LINK_SIZE; /* Continue with current branch */
continue;
/* Move the subject pointer back. This occurs only at the start of
@@ -1716,7 +1709,9 @@ for (;;)
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
+ cb.capture_last = md->capture_last & CAPLMASK;
+ /* Internal change requires this for API compatibility. */
+ if (cb.capture_last == 0) cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = md->nomatch_mark;
if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
@@ -1762,6 +1757,7 @@ for (;;)
/* Add to "recursing stack" */
new_recursive.group_num = recno;
+ new_recursive.saved_capture_last = md->capture_last;
new_recursive.subject_position = eptr;
new_recursive.prevrec = md->recursive;
md->recursive = &new_recursive;
@@ -1785,8 +1781,9 @@ for (;;)
new_recursive.saved_max * sizeof(int));
/* OK, now we can do the recursion. After processing each alternative,
- restore the offset data. If there were nested recursions, md->recursive
- might be changed, so reset it before looping. */
+ restore the offset data and the last captured value. If there were nested
+ recursions, md->recursive might be changed, so reset it before looping.
+ */
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
cbegroup = (*callpat >= OP_SBRA);
@@ -1797,6 +1794,7 @@ for (;;)
md, eptrb, RM6);
memcpy(md->offset_vector, new_recursive.offset_save,
new_recursive.saved_max * sizeof(int));
+ md->capture_last = new_recursive.saved_capture_last;
md->recursive = new_recursive.prevrec;
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
@@ -1813,11 +1811,16 @@ for (;;)
goto RECURSION_MATCHED; /* Exit loop; end processing */
}
- /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
- is treated as NOMATCH. */
+ /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
+ recursion; they cause a NOMATCH for the entire recursion. These codes
+ are defined in a range that can be tested for. */
+
+ if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
+ RRETURN(MATCH_NOMATCH);
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
- rrc != MATCH_COMMIT)
+ /* Any return code other than NOMATCH is an error. */
+
+ if (rrc != MATCH_NOMATCH)
{
DPRINTF(("Recursion gave error %d\n", rrc));
if (new_recursive.offset_save != stacksave)
@@ -1947,8 +1950,8 @@ for (;;)
/* Deal with capturing */
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ md->capture_last = (md->capture_last & OVFLMASK) | number;
+ if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
{
/* If offset is greater than offset_top, it means that we are
"skipping" a capturing group, and that group's offsets must be marked
@@ -2004,6 +2007,7 @@ for (;;)
if (*ecode == OP_KETRPOS)
{
+ md->start_match_ptr = mstart; /* In case \K reset it */
md->end_match_ptr = eptr;
md->end_offset_top = offset_top;
RRETURN(MATCH_KETRPOS);
@@ -2571,19 +2575,24 @@ for (;;)
RRETURN(MATCH_NOMATCH);
break;
- case PT_SPACE: /* Perl space */
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
- == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
- c == CHAR_FF || c == CHAR_CR)
- == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
+ (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
+ break;
+ }
break;
case PT_WORD:
@@ -2604,6 +2613,13 @@ for (;;)
}
break;
+ case PT_UCNC:
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == (op == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
+
/* This should never occur */
default:
@@ -2650,15 +2666,7 @@ for (;;)
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- case OP_REF:
- case OP_REFI:
- caseless = op == OP_REFI;
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 1 + IMM2_SIZE;
-
- /* If the reference is unset, there are two possibilities:
+ loops). If the reference is unset, there are two possibilities:
(a) In the default, Perl-compatible state, set the length negative;
this ensures that every attempt at a match fails. We can't just fail
@@ -2668,8 +2676,39 @@ for (;;)
so that the back reference matches an empty string.
Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
+ referenced subpattern.
+
+ The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
+ or to a non-duplicated named group. For a duplicated named group, OP_DNREF
+ and OP_DNREFI are used. In this case we must scan the list of groups to
+ which the name refers, and use the first one that is set. */
+
+ case OP_DNREF:
+ case OP_DNREFI:
+ caseless = op == OP_DNREFI;
+ {
+ int count = GET2(ecode, 1+IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ ecode += 1 + 2*IMM2_SIZE;
+
+ while (count-- > 0)
+ {
+ offset = GET2(slot, 0) << 1;
+ if (offset < offset_top && md->offset_vector[offset] >= 0) break;
+ slot += md->name_entry_size;
+ }
+ if (count < 0)
+ length = (md->jscript_compat)? 0 : -1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
+ }
+ goto REF_REPEAT;
+ case OP_REF:
+ case OP_REFI:
+ caseless = op == OP_REFI;
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 1 + IMM2_SIZE;
if (offset >= offset_top || md->offset_vector[offset] < 0)
length = (md->jscript_compat)? 0 : -1;
else
@@ -2677,6 +2716,7 @@ for (;;)
/* Set up for repetition, or handle the non-repeated case */
+ REF_REPEAT:
switch (*ecode)
{
case OP_CRSTAR:
@@ -2825,8 +2865,12 @@ for (;;)
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSPLUS:
+ case OP_CRPOSQUERY:
c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
+ if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
+ else possessive = TRUE;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
@@ -2834,7 +2878,9 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
minimize = (*ecode == OP_CRMINRANGE);
+ possessive = (*ecode == OP_CRPOSRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 1 + IMM2_SIZE);
if (max == 0) max = INT_MAX;
@@ -2976,6 +3022,9 @@ for (;;)
if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
eptr += len;
}
+
+ if (possessive) continue; /* No backtracking */
+
for (;;)
{
RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
@@ -3006,6 +3055,9 @@ for (;;)
if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
eptr++;
}
+
+ if (possessive) continue; /* No backtracking */
+
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
@@ -3021,9 +3073,10 @@ for (;;)
/* Control never gets here */
- /* Match an extended character class. This opcode is encountered only
- when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
- mode, because Unicode properties are supported in non-UTF-8 mode. */
+ /* Match an extended character class. In the 8-bit library, this opcode is
+ encountered only when UTF-8 mode mode is supported. In the 16-bit and
+ 32-bit libraries, codepoints greater than 255 may be encountered even when
+ UTF is not supported. */
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
@@ -3039,8 +3092,12 @@ for (;;)
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSPLUS:
+ case OP_CRPOSQUERY:
c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
+ if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
+ else possessive = TRUE;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
@@ -3048,7 +3105,9 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
minimize = (*ecode == OP_CRMINRANGE);
+ possessive = (*ecode == OP_CRPOSRANGE);
min = GET2(ecode, 1);
max = GET2(ecode, 1 + IMM2_SIZE);
if (max == 0) max = INT_MAX;
@@ -3120,6 +3179,9 @@ for (;;)
if (!PRIV(xclass)(c, data, utf)) break;
eptr += len;
}
+
+ if (possessive) continue; /* No backtracking */
+
for(;;)
{
RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
@@ -3190,7 +3252,7 @@ for (;;)
if (fc < 128)
{
- pcre_uchar cc = RAWUCHAR(eptr);
+ pcre_uint32 cc = RAWUCHAR(eptr);
if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
ecode++;
eptr++;
@@ -3295,7 +3357,22 @@ for (;;)
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. */
+ /* Common code for all repeated single-character matches. We first check
+ for the minimum number of characters. If the minimum equals the maximum, we
+ are done. Otherwise, if minimizing, check the rest of the pattern for a
+ match; if there isn't one, advance up to the maximum, one character at a
+ time.
+
+ If maximizing, advance up to the maximum number of matching characters,
+ until eptr is past the end of the maximum run. If possessive, we are
+ then done (no backing up). Otherwise, match at this position; anything
+ other than no match is immediately returned. For nomatch, back up one
+ character, unless we are matching \R and the last thing matched was
+ \r\n, in which case, back up two bytes. When we reach the first optional
+ character position, we can save stack by doing a tail recurse.
+
+ The various UTF/non-UTF and caseful/caseless cases are handled separately,
+ for speed. */
REPEATCHAR:
#ifdef SUPPORT_UTF
@@ -3379,13 +3456,12 @@ for (;;)
}
}
- if (possessive) continue;
-
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
#ifdef SUPPORT_UCP
eptr--;
BACKCHAR(eptr);
@@ -3439,8 +3515,7 @@ for (;;)
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc; /* Faster than pcre_uchar */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -3455,8 +3530,7 @@ for (;;)
{
for (fi = min;; fi++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc; /* Faster than pcre_uchar */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max) RRETURN(MATCH_NOMATCH);
@@ -3476,8 +3550,7 @@ for (;;)
pp = eptr;
for (i = min; i < max; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc; /* Faster than pcre_uchar */
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -3487,18 +3560,16 @@ for (;;)
if (fc != cc && foc != cc) break;
eptr++;
}
-
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
- /* Control never gets here */
}
/* Caseful comparisons (includes all multi-byte characters) */
@@ -3546,15 +3617,15 @@ for (;;)
if (fc != RAWUCHARTEST(eptr)) break;
eptr++;
}
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
}
/* Control never gets here */
@@ -3726,7 +3797,7 @@ for (;;)
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = 1; i <= min; i++)
@@ -3764,7 +3835,7 @@ for (;;)
}
}
else
-#endif
+#endif /*SUPPORT_UTF */
/* Not UTF mode */
{
for (fi = min;; fi++)
@@ -3806,17 +3877,18 @@ for (;;)
if (fc == d || (unsigned int)foc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = min; i < max; i++)
@@ -3829,18 +3901,17 @@ for (;;)
if (fc == *eptr || foc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
-
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
- /* Control never gets here */
}
/* Caseful comparisons */
@@ -3941,12 +4012,13 @@ for (;;)
if (fc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
@@ -3964,16 +4036,16 @@ for (;;)
if (fc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
}
}
-
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
}
/* Control never gets here */
@@ -4155,7 +4227,12 @@ for (;;)
}
break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+
case PT_SPACE: /* Perl space */
+ case PT_PXSPACE: /* POSIX space */
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject)
@@ -4164,26 +4241,18 @@ for (;;)
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_PXSPACE: /* POSIX space */
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject)
+ switch(c)
{
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ break;
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
}
break;
@@ -4225,6 +4294,22 @@ for (;;)
}
break;
+ case PT_UCNC:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ break;
+
/* This should not occur */
default:
@@ -4430,8 +4515,7 @@ for (;;)
case OP_DIGIT:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -4448,8 +4532,7 @@ for (;;)
case OP_NOT_WHITESPACE:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -4466,8 +4549,7 @@ for (;;)
case OP_WHITESPACE:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -4484,8 +4566,7 @@ for (;;)
case OP_NOT_WORDCHAR:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -4502,8 +4583,7 @@ for (;;)
case OP_WORDCHAR:
for (i = 1; i <= min; i++)
{
- pcre_uchar cc;
-
+ pcre_uint32 cc;
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
@@ -4892,25 +4972,11 @@ for (;;)
}
/* Control never gets here */
- case PT_SPACE: /* Perl space */
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= md->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
for (fi = min;; fi++)
{
@@ -4923,10 +4989,18 @@ for (;;)
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ break;
+ }
}
/* Control never gets here */
@@ -4976,6 +5050,25 @@ for (;;)
}
/* Control never gets here */
+ case PT_UCNC:
+ for (fi = min;; fi++)
+ {
+ RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == prop_fail_result)
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
/* This should never occur */
default:
RRETURN(PCRE_ERROR_INTERNAL);
@@ -5391,7 +5484,12 @@ for (;;)
}
break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+
case PT_SPACE: /* Perl space */
+ case PT_PXSPACE: /* POSIX space */
for (i = min; i < max; i++)
{
int len = 1;
@@ -5401,30 +5499,21 @@ for (;;)
break;
}
GETCHARLENTEST(c, eptr, len);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
break;
- eptr+= len;
- }
- break;
- case PT_PXSPACE: /* POSIX space */
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject)
- {
- SCHECK_PARTIAL();
+ default:
+ if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ goto ENDLOOP99; /* Break the loop */
break;
}
- GETCHARLENTEST(c, eptr, len);
- if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
- c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
- == prop_fail_result)
- break;
eptr+= len;
}
+ ENDLOOP99:
break;
case PT_WORD:
@@ -5470,23 +5559,42 @@ for (;;)
GOT_MAX:
break;
+ case PT_UCNC:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
+ c >= 0xe000) == prop_fail_result)
+ break;
+ eptr += len;
+ }
+ break;
+
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
if (utf) BACKCHAR(eptr);
}
}
- /* Match extended Unicode sequences. We will get here only if the
+ /* Match extended Unicode grapheme clusters. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
@@ -5518,22 +5626,42 @@ for (;;)
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ int lgb, rgb;
+ PCRE_PUCHAR fptr;
+
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- for (;;) /* Move back over one extended */
+
+ /* Backtracking over an extended grapheme cluster involves inspecting
+ the previous two characters (if present) to see if a break is
+ permitted between them. */
+
+ eptr--;
+ if (!utf) c = *eptr; else
+ {
+ BACKCHAR(eptr);
+ GETCHAR(c, eptr);
+ }
+ rgb = UCD_GRAPHBREAK(c);
+
+ for (;;)
{
- if (!utf) c = *eptr; else
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
+ fptr = eptr - 1;
+ if (!utf) c = *fptr; else
{
- BACKCHAR(eptr);
- GETCHAR(c, eptr);
+ BACKCHAR(fptr);
+ GETCHAR(c, fptr);
}
- if (UCD_CATEGORY(c) != ucp_M) break;
- eptr--;
+ lgb = UCD_GRAPHBREAK(c);
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
+ eptr = fptr;
+ rgb = lgb;
}
}
}
@@ -5799,18 +5927,13 @@ for (;;)
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character, unless we are matching \R and the last thing matched was
- \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
@@ -6048,15 +6171,10 @@ for (;;)
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character (byte), unless we are matching \R and the last thing matched
- was \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@@ -6065,11 +6183,8 @@ for (;;)
}
}
- /* Get here if we can't make it match with any permitted repetitions */
-
- RRETURN(MATCH_NOMATCH);
+ /* Control never gets here */
}
- /* Control never gets here */
/* There's been some horrible disaster. Arrival here can only mean there is
something seriously wrong in the code above or the OP_xxx definitions. */
@@ -6103,10 +6218,10 @@ switch (frame->Xwhere)
LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
LBL(65) LBL(66)
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
- LBL(21)
+ LBL(20) LBL(21)
#endif
#ifdef SUPPORT_UTF
- LBL(16) LBL(18) LBL(20)
+ LBL(16) LBL(18)
LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
@@ -6264,6 +6379,7 @@ const pcre_uint8 *start_bits = NULL;
PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
PCRE_PUCHAR end_subject;
PCRE_PUCHAR start_partial = NULL;
+PCRE_PUCHAR match_partial = NULL;
PCRE_PUCHAR req_char_ptr = start_match - 1;
const pcre_study_data *study;
@@ -6393,6 +6509,8 @@ md->callout_data = NULL;
tables = re->tables;
+/* The two limit values override the defaults, whatever their value. */
+
if (extra_data != NULL)
{
register unsigned int flags = extra_data->flags;
@@ -6407,6 +6525,15 @@ if (extra_data != NULL)
if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
}
+/* Limits in the regex override only if they are smaller. */
+
+if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
+ md->match_limit = re->limit_match;
+
+if ((re->flags & PCRE_RLSET) != 0 &&
+ re->limit_recursion < md->match_limit_recursion)
+ md->match_limit_recursion = re->limit_recursion;
+
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
is a feature that makes it possible to save compiled regex and re-use them
in other programs later. */
@@ -6432,7 +6559,7 @@ end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
md->use_ucp = (re->options & PCRE_UCP) != 0;
md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
-md->ignore_skip_arg = FALSE;
+md->ignore_skip_arg = 0;
/* Some options are unpacked into BOOL variables in the hope that testing
them will be faster than individual option bits. */
@@ -6542,11 +6669,9 @@ if (re->top_backref > 0 && re->top_backref >= ocount/3)
DPRINTF(("Got memory to hold back references\n"));
}
else md->offset_vector = offsets;
-
md->offset_end = ocount;
md->offset_max = (2*ocount)/3;
-md->offset_overflow = FALSE;
-md->capture_last = -1;
+md->capture_last = 0;
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
@@ -6816,8 +6941,13 @@ for(;;)
md->match_call_count = 0;
md->match_function_type = 0;
md->end_offset_top = 0;
+ md->skip_arg_count = 0;
rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
- if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
+ if (md->hitend && start_partial == NULL)
+ {
+ start_partial = md->start_used_ptr;
+ match_partial = start_match;
+ }
switch(rc)
{
@@ -6830,14 +6960,14 @@ for(;;)
case MATCH_SKIP_ARG:
new_start_match = start_match;
- md->ignore_skip_arg = TRUE;
+ md->ignore_skip_arg = md->skip_arg_count;
break;
- /* SKIP passes back the next starting point explicitly, but if it is the
- same as the match we have just done, treat it as NOMATCH. */
+ /* SKIP passes back the next starting point explicitly, but if it is no
+ greater than the match we have just done, treat it as NOMATCH. */
case MATCH_SKIP:
- if (md->start_match_ptr != start_match)
+ if (md->start_match_ptr > start_match)
{
new_start_match = md->start_match_ptr;
break;
@@ -6845,12 +6975,12 @@ for(;;)
/* Fall through */
/* NOMATCH and PRUNE advance by one character. THEN at this level acts
- exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
+ exactly like PRUNE. Unset ignore SKIP-with-argument. */
case MATCH_NOMATCH:
case MATCH_PRUNE:
case MATCH_THEN:
- md->ignore_skip_arg = FALSE;
+ md->ignore_skip_arg = 0;
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF
if (utf)
@@ -6943,7 +7073,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
(arg_offset_max - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
- if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
+ if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
DPRINTF(("Freeing temporary memory\n"));
(PUBL(free))(md->offset_vector);
}
@@ -6951,7 +7081,8 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
/* Set the return code to the number of captured strings, or 0 if there were
too many to fit into the vector. */
- rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
+ rc = ((md->capture_last & OVFLBIT) != 0 &&
+ md->end_offset_top >= arg_offset_max)?
0 : md->end_offset_top/2;
/* If there is space in the offset vector, set any unused pairs at the end of
@@ -7016,7 +7147,7 @@ if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
/* Handle partial matches - disable any mark data */
-if (start_partial != NULL)
+if (match_partial != NULL)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
md->mark = NULL;
@@ -7024,6 +7155,8 @@ if (start_partial != NULL)
{
offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
+ if (offsetcount > 2)
+ offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
}
rc = PCRE_ERROR_PARTIAL;
}