From 492d922207a318b721a63f7c2b46d5d5012df1cf Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Mon, 16 Dec 2013 11:11:47 +0100 Subject: Upgrade the PCRE bundle to 8.34 New upstream version, changelogs at: http://pcre.org/news.txt http://pcre.org/changelog.txt Qt still requires 8.30. Change-Id: I76794a3079601b07c469b952367f71f794079edc Reviewed-by: Thiago Macieira Reviewed-by: Lars Knoll --- src/3rdparty/pcre/pcre_exec.c | 959 ++++++++++++++++++++++++------------------ 1 file changed, 546 insertions(+), 413 deletions(-) (limited to 'src/3rdparty/pcre/pcre_exec.c') diff --git a/src/3rdparty/pcre/pcre_exec.c b/src/3rdparty/pcre/pcre_exec.c index c888468a25..913521ff0c 100644 --- a/src/3rdparty/pcre/pcre_exec.c +++ b/src/3rdparty/pcre/pcre_exec.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2012 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -56,6 +56,20 @@ possible. There are also some static supporting functions. */ #undef min #undef max +/* The md->capture_last field uses the lower 16 bits for the last captured +substring (which can never be greater than 65535) and a bit in the top half +to mean "capture vector overflowed". This odd way of doing things was +implemented when it was realized that preserving and restoring the overflow bit +whenever the last capture number was saved/restored made for a neater +interface, and doing it this way saved on (a) another variable, which would +have increased the stack frame size (a big NO-NO in PCRE) and (b) another +separate set of save/restore instructions. The following defines are used in +implementing this. */ + +#define CAPLMASK 0x0000ffff /* The bits used for last_capture */ +#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */ +#define OVFLBIT 0x00010000 /* The bit that is set for overflow */ + /* Values for setting in md->match_function_type to indicate two special types of call to match(). We do it this way to save on using another stack variable, as stack usage is to be discouraged. */ @@ -73,13 +87,17 @@ defined PCRE_ERROR_xxx codes, which are all negative. */ negative to avoid the external error codes. */ #define MATCH_ACCEPT (-999) -#define MATCH_COMMIT (-998) -#define MATCH_KETRPOS (-997) -#define MATCH_ONCE (-996) +#define MATCH_KETRPOS (-998) +#define MATCH_ONCE (-997) +/* The next 5 must be kept together and in sequence so that a test that checks +for any one of them can use a range. */ +#define MATCH_COMMIT (-996) #define MATCH_PRUNE (-995) #define MATCH_SKIP (-994) #define MATCH_SKIP_ARG (-993) #define MATCH_THEN (-992) +#define MATCH_BACKTRACK_MAX MATCH_THEN +#define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, @@ -89,8 +107,8 @@ because the offset vector is always a multiple of 3 long. */ /* Min and max values for the common repeats; for the maxima, 0 => infinity */ -static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; +static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; +static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; #ifdef PCRE_DEBUG /************************************************* @@ -149,7 +167,7 @@ match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, { PCRE_PUCHAR eptr_start = eptr; register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && defined SUPPORT_UCP BOOL utf = md->utf; #endif @@ -177,8 +195,7 @@ ASCII characters. */ if (caseless) { -#ifdef SUPPORT_UTF -#ifdef SUPPORT_UCP +#if defined SUPPORT_UTF && defined SUPPORT_UCP if (utf) { /* Match characters up to the end of the reference. NOTE: the number of @@ -211,7 +228,6 @@ if (caseless) } } else -#endif #endif /* The same code works when not in UTF-8 mode and in UTF-8 mode when there @@ -219,7 +235,7 @@ if (caseless) { while (length-- > 0) { - pcre_uchar cc, cp; + pcre_uint32 cc, cp; if (eptr >= md->end_subject) return -2; /* Partial match */ cc = RAWUCHARTEST(eptr); cp = RAWUCHARTEST(p); @@ -416,10 +432,10 @@ typedef struct heapframe { int Xlength; int Xmax; int Xmin; - int Xnumber; + unsigned int Xnumber; int Xoffset; - int Xop; - int Xsave_capture_last; + unsigned int Xop; + pcre_int32 Xsave_capture_last; int Xsave_offset1, Xsave_offset2, Xsave_offset3; int Xstacksave[REC_STACK_SAVE_MAX]; @@ -634,8 +650,8 @@ int max; int min; unsigned int number; int offset; -pcre_uchar op; -int save_capture_last; +unsigned int op; +pcre_int32 save_capture_last; int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; @@ -763,23 +779,16 @@ for (;;) case OP_FAIL: RRETURN(MATCH_NOMATCH); - /* COMMIT overrides PRUNE, SKIP, and THEN */ - case OP_COMMIT: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM52); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && - rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && - rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_COMMIT); - /* PRUNE overrides THEN */ - case OP_PRUNE: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM51); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); case OP_PRUNE_ARG: @@ -789,38 +798,39 @@ for (;;) eptrb, RM56); if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && md->mark == NULL) md->mark = ecode + 2; - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); - /* SKIP overrides PRUNE and THEN */ - case OP_SKIP: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM53); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position */ RRETURN(MATCH_SKIP); /* Note that, for Perl compatibility, SKIP with an argument does NOT set - nomatch_mark. There is a flag that disables this opcode when re-matching a - pattern that ended with a SKIP for which there was not a matching MARK. */ + nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was + not a matching mark, we have to re-run the match, ignoring the SKIP_ARG + that failed and any that precede it (either they also failed, or were not + triggered). To do this, we maintain a count of executed SKIP_ARGs. If a + SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg + set to the count of the one that failed. */ case OP_SKIP_ARG: - if (md->ignore_skip_arg) + md->skip_arg_count++; + if (md->skip_arg_count <= md->ignore_skip_arg) { ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; break; } RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM57); - if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) - RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass back the current skip name by overloading md->start_match_ptr and returning the special MATCH_SKIP_ARG return code. This will either be caught by a matching MARK, or get to the top, where it causes a rematch - with the md->ignore_skip_arg flag set. */ + with md->ignore_skip_arg set to the value of md->skip_arg_count. */ md->start_match_ptr = ecode + 2; RRETURN(MATCH_SKIP_ARG); @@ -1066,6 +1076,7 @@ for (;;) /* In all other cases, we have to make another call to match(). */ save_mark = md->mark; + save_capture_last = md->capture_last; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM2); @@ -1097,6 +1108,7 @@ for (;;) ecode += GET(ecode, 1); md->mark = save_mark; if (*ecode != OP_ALT) break; + md->capture_last = save_capture_last; } RRETURN(MATCH_NOMATCH); @@ -1159,6 +1171,7 @@ for (;;) ecode = md->start_code + code_offset; save_capture_last = md->capture_last; matched_once = TRUE; + mstart = md->start_match_ptr; /* In case \K changed it */ continue; } @@ -1218,6 +1231,7 @@ for (;;) POSSESSIVE_NON_CAPTURE: matched_once = FALSE; code_offset = (int)(ecode - md->start_code); + save_capture_last = md->capture_last; for (;;) { @@ -1230,6 +1244,7 @@ for (;;) eptr = md->end_match_ptr; ecode = md->start_code + code_offset; matched_once = TRUE; + mstart = md->start_match_ptr; /* In case \K reset it */ continue; } @@ -1247,6 +1262,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); if (*ecode != OP_ALT) break; + md->capture_last = save_capture_last; } if (matched_once || allow_zero) @@ -1258,25 +1274,32 @@ for (;;) /* Control never reaches here. */ - /* Conditional group: compilation checked that there are no more than - two branches. If the condition is false, skipping the first branch takes us - past the end if there is only one branch, but that's OK because that is - exactly what going to the ket would do. */ + /* Conditional group: compilation checked that there are no more than two + branches. If the condition is false, skipping the first branch takes us + past the end of the item if there is only one branch, but that's exactly + what we want. */ case OP_COND: case OP_SCOND: - codelink = GET(ecode, 1); + + /* The variable codelink will be added to ecode when the condition is + false, to get to the second branch. Setting it to the offset to the ALT + or KET, then incrementing ecode achieves this effect. We now have ecode + pointing to the condition or callout. */ + + codelink = GET(ecode, 1); /* Offset to the second branch */ + ecode += 1 + LINK_SIZE; /* From this opcode */ /* Because of the way auto-callout works during compile, a callout item is inserted between OP_COND and an assertion condition. */ - if (ecode[LINK_SIZE+1] == OP_CALLOUT) + if (*ecode == OP_CALLOUT) { if (PUBL(callout) != NULL) { PUBL(callout_block) cb; cb.version = 2; /* Version 1 of the callout block */ - cb.callout_number = ecode[LINK_SIZE+2]; + cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; #if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; @@ -1288,215 +1311,130 @@ for (;;) cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); cb.current_position = (int)(eptr - md->start_subject); - cb.pattern_position = GET(ecode, LINK_SIZE + 3); - cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); + cb.pattern_position = GET(ecode, 2); + cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; + cb.capture_last = md->capture_last & CAPLMASK; + /* Internal change requires this for API compatibility. */ + if (cb.capture_last == 0) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } + + /* Advance ecode past the callout, so it now points to the condition. We + must adjust codelink so that the value of ecode+codelink is unchanged. */ + ecode += PRIV(OP_lengths)[OP_CALLOUT]; + codelink -= PRIV(OP_lengths)[OP_CALLOUT]; } - condcode = ecode[LINK_SIZE+1]; + /* Test the various possible conditions */ - /* Now see what the actual condition is */ - - if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ + condition = FALSE; + switch(condcode = *ecode) { - if (md->recursive == NULL) /* Not recursing => FALSE */ - { - condition = FALSE; - ecode += GET(ecode, 1); - } - else + case OP_RREF: /* Numbered group recursion test */ + if (md->recursive != NULL) /* Not recursing => FALSE */ { - unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ condition = (recno == RREF_ANY || recno == md->recursive->group_num); + } + break; - /* If the test is for recursion into a specific subpattern, and it is - false, but the test was set up by name, scan the table to see if the - name refers to any other numbers, and test them. The condition is true - if any one is set. */ - - if (!condition && condcode == OP_NRREF) + case OP_DNRREF: /* Duplicate named group recursion test */ + if (md->recursive != NULL) + { + int count = GET2(ecode, 1 + IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + while (count-- > 0) { - pcre_uchar *slotA = md->name_table; - for (i = 0; i < md->name_count; i++) - { - if (GET2(slotA, 0) == recno) break; - slotA += md->name_entry_size; - } - - /* Found a name for the number - there can be only one; duplicate - names for different numbers are allowed, but not vice versa. First - scan down for duplicates. */ - - if (i < md->name_count) - { - pcre_uchar *slotB = slotA; - while (slotB > md->name_table) - { - slotB -= md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - condition = GET2(slotB, 0) == md->recursive->group_num; - if (condition) break; - } - else break; - } - - /* Scan up for duplicates */ - - if (!condition) - { - slotB = slotA; - for (i++; i < md->name_count; i++) - { - slotB += md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - condition = GET2(slotB, 0) == md->recursive->group_num; - if (condition) break; - } - else break; - } - } - } + unsigned int recno = GET2(slot, 0); + condition = recno == md->recursive->group_num; + if (condition) break; + slot += md->name_entry_size; } - - /* Chose branch according to the condition */ - - ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); } - } + break; - else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ - { - offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ + case OP_CREF: /* Numbered group used test */ + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; + break; - /* If the numbered capture is unset, but the reference was by name, - scan the table to see if the name refers to any other numbers, and test - them. The condition is true if any one is set. This is tediously similar - to the code above, but not close enough to try to amalgamate. */ - - if (!condition && condcode == OP_NCREF) + case OP_DNCREF: /* Duplicate named group used test */ { - unsigned int refno = offset >> 1; - pcre_uchar *slotA = md->name_table; - - for (i = 0; i < md->name_count; i++) + int count = GET2(ecode, 1 + IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + while (count-- > 0) { - if (GET2(slotA, 0) == refno) break; - slotA += md->name_entry_size; - } - - /* Found a name for the number - there can be only one; duplicate names - for different numbers are allowed, but not vice versa. First scan down - for duplicates. */ - - if (i < md->name_count) - { - pcre_uchar *slotB = slotA; - while (slotB > md->name_table) - { - slotB -= md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - offset = GET2(slotB, 0) << 1; - condition = offset < offset_top && - md->offset_vector[offset] >= 0; - if (condition) break; - } - else break; - } - - /* Scan up for duplicates */ - - if (!condition) - { - slotB = slotA; - for (i++; i < md->name_count; i++) - { - slotB += md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - offset = GET2(slotB, 0) << 1; - condition = offset < offset_top && - md->offset_vector[offset] >= 0; - if (condition) break; - } - else break; - } - } + offset = GET2(slot, 0) << 1; + condition = offset < offset_top && md->offset_vector[offset] >= 0; + if (condition) break; + slot += md->name_entry_size; } } + break; - /* Chose branch according to the condition */ - - ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); - } - - else if (condcode == OP_DEF) /* DEFINE - always false */ - { - condition = FALSE; - ecode += GET(ecode, 1); - } + case OP_DEF: /* DEFINE - always false */ + break; - /* The condition is an assertion. Call match() to evaluate it - setting - md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of - an assertion. */ + /* The condition is an assertion. Call match() to evaluate it - setting + md->match_function_type to MATCH_CONDASSERT causes it to stop at the end + of an assertion. */ - else - { + default: md->match_function_type = MATCH_CONDASSERT; - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3); + RMATCH(eptr, ecode, offset_top, md, NULL, RM3); if (rrc == MATCH_MATCH) { if (md->end_offset_top > offset_top) offset_top = md->end_offset_top; /* Captures may have happened */ condition = TRUE; - ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); + + /* Advance ecode past the assertion to the start of the first branch, + but adjust it so that the general choosing code below works. */ + + ecode += GET(ecode, 1); while (*ecode == OP_ALT) ecode += GET(ecode, 1); + ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; } /* PCRE doesn't allow the effect of (*THEN) to escape beyond an - assertion; it is therefore treated as NOMATCH. */ + assertion; it is therefore treated as NOMATCH. Any other return is an + error. */ else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { RRETURN(rrc); /* Need braces because of following else */ } - else - { - condition = FALSE; - ecode += codelink; - } + break; } - /* We are now at the branch that is to be obeyed. As there is only one, can - use tail recursion to avoid using another stack frame, except when there is - unlimited repeat of a possibly empty group. In the latter case, a recursive - call to match() is always required, unless the second alternative doesn't - exist, in which case we can just plough on. Note that, for compatibility - with Perl, the | in a conditional group is NOT treated as creating two - alternatives. If a THEN is encountered in the branch, it propagates out to - the enclosing alternative (unless nested in a deeper set of alternatives, - of course). */ - - if (condition || *ecode == OP_ALT) + /* Choose branch according to the condition */ + + ecode += condition? PRIV(OP_lengths)[condcode] : codelink; + + /* We are now at the branch that is to be obeyed. As there is only one, we + can use tail recursion to avoid using another stack frame, except when + there is unlimited repeat of a possibly empty group. In the latter case, a + recursive call to match() is always required, unless the second alternative + doesn't exist, in which case we can just plough on. Note that, for + compatibility with Perl, the | in a conditional group is NOT treated as + creating two alternatives. If a THEN is encountered in the branch, it + propagates out to the enclosing alternative (unless nested in a deeper set + of alternatives, of course). */ + + if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) { if (op != OP_SCOND) { - ecode += 1 + LINK_SIZE; goto TAIL_RECURSE; } md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49); + RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); RRETURN(rrc); } @@ -1504,7 +1442,6 @@ for (;;) else { - ecode += 1 + LINK_SIZE; } break; @@ -1513,7 +1450,7 @@ for (;;) to close any currently open capturing brackets. */ case OP_CLOSE: - number = GET2(ecode, 1); + number = GET2(ecode, 1); /* Must be less than 65536 */ offset = number << 1; #ifdef PCRE_DEBUG @@ -1521,8 +1458,8 @@ for (;;) printf("\n"); #endif - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else + md->capture_last = (md->capture_last & OVFLMASK) | number; + if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; @@ -1584,28 +1521,49 @@ for (;;) } else condassert = FALSE; + /* Loop for each branch */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); + + /* A match means that the assertion is true; break out of the loop + that matches its alternatives. */ + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { mstart = md->start_match_ptr; /* In case \K reset it */ break; } + + /* If not matched, restore the previous mark setting. */ + md->mark = save_mark; - /* A COMMIT failure must fail the entire assertion, without trying any - subsequent branches. */ + /* See comment in the code for capturing groups above about handling + THEN. */ - if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH); + if (rrc == MATCH_THEN) + { + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + rrc = MATCH_NOMATCH; + } - /* PCRE does not allow THEN to escape beyond an assertion; it - is treated as NOMATCH. */ + /* Anything other than NOMATCH causes the entire assertion to fail, + passing back the return code. This includes COMMIT, SKIP, PRUNE and an + uncaptured THEN, which means they take their normal effect. This + consistent approach does not always have exactly the same effect as in + Perl. */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); } - while (*ecode == OP_ALT); + while (*ecode == OP_ALT); /* Continue for next alternative */ + + /* If we have tried all the alternative branches, the assertion has + failed. If not, we broke out after a match. */ if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); @@ -1613,17 +1571,16 @@ for (;;) if (condassert) RRETURN(MATCH_MATCH); - /* Continue from after the assertion, updating the offsets high water - mark, since extracts may have been taken during the assertion. */ + /* Continue from after a successful assertion, updating the offsets high + water mark, since extracts may have been taken during the assertion. */ do ecode += GET(ecode,1); while (*ecode == OP_ALT); ecode += 1 + LINK_SIZE; offset_top = md->end_offset_top; continue; - /* Negative assertion: all branches must fail to match. Encountering SKIP, - PRUNE, or COMMIT means we must assume failure without checking subsequent - branches. */ + /* Negative assertion: all branches must fail to match for the assertion to + succeed. */ case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: @@ -1635,28 +1592,64 @@ for (;;) } else condassert = FALSE; + /* Loop for each alternative branch. */ + do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); - md->mark = save_mark; - if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); - if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) + md->mark = save_mark; /* Always restore the mark setting */ + + switch(rrc) { - do ecode += GET(ecode,1); while (*ecode == OP_ALT); + case MATCH_MATCH: /* A successful match means */ + case MATCH_ACCEPT: /* the assertion has failed. */ + RRETURN(MATCH_NOMATCH); + + case MATCH_NOMATCH: /* Carry on with next branch */ break; + + /* See comment in the code for capturing groups above about handling + THEN. */ + + case MATCH_THEN: + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + { + rrc = MATCH_NOMATCH; + break; + } + /* Otherwise fall through. */ + + /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole + assertion to fail to match, without considering any more alternatives. + Failing to match means the assertion is true. This is a consistent + approach, but does not always have the same effect as in Perl. */ + + case MATCH_COMMIT: + case MATCH_SKIP: + case MATCH_SKIP_ARG: + case MATCH_PRUNE: + do ecode += GET(ecode,1); while (*ecode == OP_ALT); + goto NEG_ASSERT_TRUE; /* Break out of alternation loop */ + + /* Anything else is an error */ + + default: + RRETURN(rrc); } - /* PCRE does not allow THEN to escape beyond an assertion; it is treated - as NOMATCH. */ + /* Continue with next branch */ - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); - if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ + /* All branches in the assertion failed to match. */ - ecode += 1 + LINK_SIZE; + NEG_ASSERT_TRUE: + if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ + ecode += 1 + LINK_SIZE; /* Continue with current branch */ continue; /* Move the subject pointer back. This occurs only at the start of @@ -1716,7 +1709,9 @@ for (;;) cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; + cb.capture_last = md->capture_last & CAPLMASK; + /* Internal change requires this for API compatibility. */ + if (cb.capture_last == 0) cb.capture_last = -1; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); @@ -1762,6 +1757,7 @@ for (;;) /* Add to "recursing stack" */ new_recursive.group_num = recno; + new_recursive.saved_capture_last = md->capture_last; new_recursive.subject_position = eptr; new_recursive.prevrec = md->recursive; md->recursive = &new_recursive; @@ -1785,8 +1781,9 @@ for (;;) new_recursive.saved_max * sizeof(int)); /* OK, now we can do the recursion. After processing each alternative, - restore the offset data. If there were nested recursions, md->recursive - might be changed, so reset it before looping. */ + restore the offset data and the last captured value. If there were nested + recursions, md->recursive might be changed, so reset it before looping. + */ DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); cbegroup = (*callpat >= OP_SBRA); @@ -1797,6 +1794,7 @@ for (;;) md, eptrb, RM6); memcpy(md->offset_vector, new_recursive.offset_save, new_recursive.saved_max * sizeof(int)); + md->capture_last = new_recursive.saved_capture_last; md->recursive = new_recursive.prevrec; if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { @@ -1813,11 +1811,16 @@ for (;;) goto RECURSION_MATCHED; /* Exit loop; end processing */ } - /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it - is treated as NOMATCH. */ + /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a + recursion; they cause a NOMATCH for the entire recursion. These codes + are defined in a range that can be tested for. */ + + if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) + RRETURN(MATCH_NOMATCH); - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN && - rrc != MATCH_COMMIT) + /* Any return code other than NOMATCH is an error. */ + + if (rrc != MATCH_NOMATCH) { DPRINTF(("Recursion gave error %d\n", rrc)); if (new_recursive.offset_save != stacksave) @@ -1947,8 +1950,8 @@ for (;;) /* Deal with capturing */ - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else + md->capture_last = (md->capture_last & OVFLMASK) | number; + if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else { /* If offset is greater than offset_top, it means that we are "skipping" a capturing group, and that group's offsets must be marked @@ -2004,6 +2007,7 @@ for (;;) if (*ecode == OP_KETRPOS) { + md->start_match_ptr = mstart; /* In case \K reset it */ md->end_match_ptr = eptr; md->end_offset_top = offset_top; RRETURN(MATCH_KETRPOS); @@ -2571,19 +2575,24 @@ for (;;) RRETURN(MATCH_NOMATCH); break; - case PT_SPACE: /* Perl space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) - == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR) - == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == + (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); + break; + } break; case PT_WORD: @@ -2604,6 +2613,13 @@ for (;;) } break; + case PT_UCNC: + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + /* This should never occur */ default: @@ -2650,15 +2666,7 @@ for (;;) similar code to character type repeats - written out again for speed. However, if the referenced string is the empty string, always treat it as matched, any number of times (otherwise there could be infinite - loops). */ - - case OP_REF: - case OP_REFI: - caseless = op == OP_REFI; - offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 1 + IMM2_SIZE; - - /* If the reference is unset, there are two possibilities: + loops). If the reference is unset, there are two possibilities: (a) In the default, Perl-compatible state, set the length negative; this ensures that every attempt at a match fails. We can't just fail @@ -2668,8 +2676,39 @@ for (;;) so that the back reference matches an empty string. Otherwise, set the length to the length of what was matched by the - referenced subpattern. */ + referenced subpattern. + + The OP_REF and OP_REFI opcodes are used for a reference to a numbered group + or to a non-duplicated named group. For a duplicated named group, OP_DNREF + and OP_DNREFI are used. In this case we must scan the list of groups to + which the name refers, and use the first one that is set. */ + + case OP_DNREF: + case OP_DNREFI: + caseless = op == OP_DNREFI; + { + int count = GET2(ecode, 1+IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + ecode += 1 + 2*IMM2_SIZE; + + while (count-- > 0) + { + offset = GET2(slot, 0) << 1; + if (offset < offset_top && md->offset_vector[offset] >= 0) break; + slot += md->name_entry_size; + } + if (count < 0) + length = (md->jscript_compat)? 0 : -1; + else + length = md->offset_vector[offset+1] - md->offset_vector[offset]; + } + goto REF_REPEAT; + case OP_REF: + case OP_REFI: + caseless = op == OP_REFI; + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ + ecode += 1 + IMM2_SIZE; if (offset >= offset_top || md->offset_vector[offset] < 0) length = (md->jscript_compat)? 0 : -1; else @@ -2677,6 +2716,7 @@ for (;;) /* Set up for repetition, or handle the non-repeated case */ + REF_REPEAT: switch (*ecode) { case OP_CRSTAR: @@ -2825,8 +2865,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -2834,7 +2878,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -2976,6 +3022,9 @@ for (;;) if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr += len; } + + if (possessive) continue; /* No backtracking */ + for (;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); @@ -3006,6 +3055,9 @@ for (;;) if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr++; } + + if (possessive) continue; /* No backtracking */ + while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); @@ -3021,9 +3073,10 @@ for (;;) /* Control never gets here */ - /* Match an extended character class. This opcode is encountered only - when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 - mode, because Unicode properties are supported in non-UTF-8 mode. */ + /* Match an extended character class. In the 8-bit library, this opcode is + encountered only when UTF-8 mode mode is supported. In the 16-bit and + 32-bit libraries, codepoints greater than 255 may be encountered even when + UTF is not supported. */ #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: @@ -3039,8 +3092,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -3048,7 +3105,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -3120,6 +3179,9 @@ for (;;) if (!PRIV(xclass)(c, data, utf)) break; eptr += len; } + + if (possessive) continue; /* No backtracking */ + for(;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); @@ -3190,7 +3252,7 @@ for (;;) if (fc < 128) { - pcre_uchar cc = RAWUCHAR(eptr); + pcre_uint32 cc = RAWUCHAR(eptr); if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); ecode++; eptr++; @@ -3295,7 +3357,22 @@ for (;;) max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; - /* Common code for all repeated single-character matches. */ + /* Common code for all repeated single-character matches. We first check + for the minimum number of characters. If the minimum equals the maximum, we + are done. Otherwise, if minimizing, check the rest of the pattern for a + match; if there isn't one, advance up to the maximum, one character at a + time. + + If maximizing, advance up to the maximum number of matching characters, + until eptr is past the end of the maximum run. If possessive, we are + then done (no backing up). Otherwise, match at this position; anything + other than no match is immediately returned. For nomatch, back up one + character, unless we are matching \R and the last thing matched was + \r\n, in which case, back up two bytes. When we reach the first optional + character position, we can save stack by doing a tail recurse. + + The various UTF/non-UTF and caseful/caseless cases are handled separately, + for speed. */ REPEATCHAR: #ifdef SUPPORT_UTF @@ -3379,13 +3456,12 @@ for (;;) } } - if (possessive) continue; - + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) { RRETURN(MATCH_NOMATCH); } #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); @@ -3439,8 +3515,7 @@ for (;;) for (i = 1; i <= min; i++) { - pcre_uchar cc; - + pcre_uint32 cc; /* Faster than pcre_uchar */ if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -3455,8 +3530,7 @@ for (;;) { for (fi = min;; fi++) { - pcre_uchar cc; - + pcre_uint32 cc; /* Faster than pcre_uchar */ RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); @@ -3476,8 +3550,7 @@ for (;;) pp = eptr; for (i = min; i < max; i++) { - pcre_uchar cc; - + pcre_uint32 cc; /* Faster than pcre_uchar */ if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -3487,18 +3560,16 @@ for (;;) if (fc != cc && foc != cc) break; eptr++; } - - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ } /* Caseful comparisons (includes all multi-byte characters) */ @@ -3546,15 +3617,15 @@ for (;;) if (fc != RAWUCHARTEST(eptr)) break; eptr++; } - if (possessive) continue; - - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } } /* Control never gets here */ @@ -3726,7 +3797,7 @@ for (;;) } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = 1; i <= min; i++) @@ -3764,7 +3835,7 @@ for (;;) } } else -#endif +#endif /*SUPPORT_UTF */ /* Not UTF mode */ { for (fi = min;; fi++) @@ -3806,17 +3877,18 @@ for (;;) if (fc == d || (unsigned int)foc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } else -#endif +#endif /* SUPPORT_UTF */ /* Not UTF mode */ { for (i = min; i < max; i++) @@ -3829,18 +3901,17 @@ for (;;) if (fc == *eptr || foc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ } /* Caseful comparisons */ @@ -3941,12 +4012,13 @@ for (;;) if (fc == d) break; eptr += len; } - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); } } @@ -3964,16 +4036,16 @@ for (;;) if (fc == *eptr) break; eptr++; } - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } } /* Control never gets here */ @@ -4155,7 +4227,12 @@ for (;;) } break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -4164,26 +4241,18 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_PXSPACE: /* POSIX space */ - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) + switch(c) { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + break; } - GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); } break; @@ -4225,6 +4294,22 @@ for (;;) } break; + case PT_UCNC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + /* This should not occur */ default: @@ -4430,8 +4515,7 @@ for (;;) case OP_DIGIT: for (i = 1; i <= min; i++) { - pcre_uchar cc; - + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -4448,8 +4532,7 @@ for (;;) case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) { - pcre_uchar cc; - + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -4466,8 +4549,7 @@ for (;;) case OP_WHITESPACE: for (i = 1; i <= min; i++) { - pcre_uchar cc; - + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -4484,8 +4566,7 @@ for (;;) case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) { - pcre_uchar cc; - + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -4502,8 +4583,7 @@ for (;;) case OP_WORDCHAR: for (i = 1; i <= min; i++) { - pcre_uchar cc; - + pcre_uint32 cc; if (eptr >= md->end_subject) { SCHECK_PARTIAL(); @@ -4892,25 +4972,11 @@ for (;;) } /* Control never gets here */ - case PT_SPACE: /* Perl space */ - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); - if (eptr >= md->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ for (fi = min;; fi++) { @@ -4923,10 +4989,18 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + break; + } } /* Control never gets here */ @@ -4976,6 +5050,25 @@ for (;;) } /* Control never gets here */ + case PT_UCNC: + for (fi = min;; fi++) + { + RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) RRETURN(MATCH_NOMATCH); + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(c, eptr); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + /* This should never occur */ default: RRETURN(PCRE_ERROR_INTERNAL); @@ -5391,7 +5484,12 @@ for (;;) } break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (i = min; i < max; i++) { int len = 1; @@ -5401,30 +5499,21 @@ for (;;) break; } GETCHARLENTEST(c, eptr, len); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ break; - eptr+= len; - } - break; - case PT_PXSPACE: /* POSIX space */ - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) - { - SCHECK_PARTIAL(); + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + goto ENDLOOP99; /* Break the loop */ break; } - GETCHARLENTEST(c, eptr, len); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - break; eptr+= len; } + ENDLOOP99: break; case PT_WORD: @@ -5470,23 +5559,42 @@ for (;;) GOT_MAX: break; + case PT_UCNC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(c, eptr, len); + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) || + c >= 0xe000) == prop_fail_result) + break; + eptr += len; + } + break; + default: RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; if (utf) BACKCHAR(eptr); } } - /* Match extended Unicode sequences. We will get here only if the + /* Match extended Unicode grapheme clusters. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ else if (ctype == OP_EXTUNI) @@ -5518,22 +5626,42 @@ for (;;) /* eptr is now past the end of the maximum run */ - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + int lgb, rgb; + PCRE_PUCHAR fptr; + + if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - for (;;) /* Move back over one extended */ + + /* Backtracking over an extended grapheme cluster involves inspecting + the previous two characters (if present) to see if a break is + permitted between them. */ + + eptr--; + if (!utf) c = *eptr; else + { + BACKCHAR(eptr); + GETCHAR(c, eptr); + } + rgb = UCD_GRAPHBREAK(c); + + for (;;) { - if (!utf) c = *eptr; else + if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ + fptr = eptr - 1; + if (!utf) c = *fptr; else { - BACKCHAR(eptr); - GETCHAR(c, eptr); + BACKCHAR(fptr); + GETCHAR(c, fptr); } - if (UCD_CATEGORY(c) != ucp_M) break; - eptr--; + lgb = UCD_GRAPHBREAK(c); + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + eptr = fptr; + rgb = lgb; } } } @@ -5799,18 +5927,13 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character, unless we are matching \R and the last thing matched was - \r\n, in which case, back up two bytes. */ - - if (possessive) continue; + if (possessive) continue; /* No backtracking */ for(;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ + eptr--; BACKCHAR(eptr); if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL && RAWUCHAR(eptr - 1) == CHAR_CR) eptr--; @@ -6048,15 +6171,10 @@ for (;;) RRETURN(PCRE_ERROR_INTERNAL); } - /* eptr is now past the end of the maximum run. If possessive, we are - done (no backing up). Otherwise, match at this position; anything other - than no match is immediately returned. For nomatch, back up one - character (byte), unless we are matching \R and the last thing matched - was \r\n, in which case, back up two bytes. */ - - if (possessive) continue; - while (eptr >= pp) + if (possessive) continue; /* No backtracking */ + for (;;) { + if (eptr == pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -6065,11 +6183,8 @@ for (;;) } } - /* Get here if we can't make it match with any permitted repetitions */ - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ /* There's been some horrible disaster. Arrival here can only mean there is something seriously wrong in the code above or the OP_xxx definitions. */ @@ -6103,10 +6218,10 @@ switch (frame->Xwhere) LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) LBL(65) LBL(66) #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 - LBL(21) + LBL(20) LBL(21) #endif #ifdef SUPPORT_UTF - LBL(16) LBL(18) LBL(20) + LBL(16) LBL(18) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP @@ -6264,6 +6379,7 @@ const pcre_uint8 *start_bits = NULL; PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; PCRE_PUCHAR end_subject; PCRE_PUCHAR start_partial = NULL; +PCRE_PUCHAR match_partial = NULL; PCRE_PUCHAR req_char_ptr = start_match - 1; const pcre_study_data *study; @@ -6393,6 +6509,8 @@ md->callout_data = NULL; tables = re->tables; +/* The two limit values override the defaults, whatever their value. */ + if (extra_data != NULL) { register unsigned int flags = extra_data->flags; @@ -6407,6 +6525,15 @@ if (extra_data != NULL) if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; } +/* Limits in the regex override only if they are smaller. */ + +if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit) + md->match_limit = re->limit_match; + +if ((re->flags & PCRE_RLSET) != 0 && + re->limit_recursion < md->match_limit_recursion) + md->match_limit_recursion = re->limit_recursion; + /* If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ @@ -6432,7 +6559,7 @@ end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; md->use_ucp = (re->options & PCRE_UCP) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; -md->ignore_skip_arg = FALSE; +md->ignore_skip_arg = 0; /* Some options are unpacked into BOOL variables in the hope that testing them will be faster than individual option bits. */ @@ -6542,11 +6669,9 @@ if (re->top_backref > 0 && re->top_backref >= ocount/3) DPRINTF(("Got memory to hold back references\n")); } else md->offset_vector = offsets; - md->offset_end = ocount; md->offset_max = (2*ocount)/3; -md->offset_overflow = FALSE; -md->capture_last = -1; +md->capture_last = 0; /* Reset the working variable associated with each extraction. These should never be used unless previously set, but they get saved and restored, and so we @@ -6816,8 +6941,13 @@ for(;;) md->match_call_count = 0; md->match_function_type = 0; md->end_offset_top = 0; + md->skip_arg_count = 0; rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); - if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; + if (md->hitend && start_partial == NULL) + { + start_partial = md->start_used_ptr; + match_partial = start_match; + } switch(rc) { @@ -6830,14 +6960,14 @@ for(;;) case MATCH_SKIP_ARG: new_start_match = start_match; - md->ignore_skip_arg = TRUE; + md->ignore_skip_arg = md->skip_arg_count; break; - /* SKIP passes back the next starting point explicitly, but if it is the - same as the match we have just done, treat it as NOMATCH. */ + /* SKIP passes back the next starting point explicitly, but if it is no + greater than the match we have just done, treat it as NOMATCH. */ case MATCH_SKIP: - if (md->start_match_ptr != start_match) + if (md->start_match_ptr > start_match) { new_start_match = md->start_match_ptr; break; @@ -6845,12 +6975,12 @@ for(;;) /* Fall through */ /* NOMATCH and PRUNE advance by one character. THEN at this level acts - exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */ + exactly like PRUNE. Unset ignore SKIP-with-argument. */ case MATCH_NOMATCH: case MATCH_PRUNE: case MATCH_THEN: - md->ignore_skip_arg = FALSE; + md->ignore_skip_arg = 0; new_start_match = start_match + 1; #ifdef SUPPORT_UTF if (utf) @@ -6943,7 +7073,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) (arg_offset_max - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } - if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE; + if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT; DPRINTF(("Freeing temporary memory\n")); (PUBL(free))(md->offset_vector); } @@ -6951,7 +7081,8 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) /* Set the return code to the number of captured strings, or 0 if there were too many to fit into the vector. */ - rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)? + rc = ((md->capture_last & OVFLBIT) != 0 && + md->end_offset_top >= arg_offset_max)? 0 : md->end_offset_top/2; /* If there is space in the offset vector, set any unused pairs at the end of @@ -7016,7 +7147,7 @@ if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) /* Handle partial matches - disable any mark data */ -if (start_partial != NULL) +if (match_partial != NULL) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); md->mark = NULL; @@ -7024,6 +7155,8 @@ if (start_partial != NULL) { offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); + if (offsetcount > 2) + offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject); } rc = PCRE_ERROR_PARTIAL; } -- cgit v1.2.3