summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_match.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_match.c')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_match.c8355
1 files changed, 3986 insertions, 4369 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_match.c b/src/3rdparty/pcre2/src/pcre2_match.c
index 0763a239e1..8741e1432d 100644
--- a/src/3rdparty/pcre2/src/pcre2_match.c
+++ b/src/3rdparty/pcre2/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016 University of Cambridge
+ New API code Copyright (c) 2015-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -43,17 +43,31 @@ POSSIBILITY OF SUCH DAMAGE.
#include "config.h"
#endif
-#define NLBLOCK mb /* Block containing newline information */
-#define PSSTART start_subject /* Field containing processed string start */
-#define PSEND end_subject /* Field containing processed string end */
+/* These defines enable debugging code */
+
+/* #define DEBUG_FRAMES_DISPLAY */
+/* #define DEBUG_SHOW_OPS */
+/* #define DEBUG_SHOW_RMATCH */
+
+/* display_frames() is variadic, so <stdarg.h> is needed whenever the
+DEBUG_FRAMES_DISPLAY switch above is enabled. The guard must use exactly the
+same macro name as the one that enables the function. */
+
+#ifdef DEBUG_FRAMES_DISPLAY
+#include <stdarg.h>
+#endif
+
+/* These defines identify the name of the block containing "static"
+information, and fields within it. */
+
+#define NLBLOCK mb /* Block containing newline information */
+#define PSSTART start_subject /* Field containing processed string start */
+#define PSEND end_subject /* Field containing processed string end */
#include "pcre2_internal.h"
-/* Masks for identifying the public options that are permitted at match time.
-*/
+#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
+
+/* Masks for identifying the public options that are permitted at match time. */
#define PUBLIC_MATCH_OPTIONS \
- (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
+ (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT)
@@ -61,60 +75,255 @@ POSSIBILITY OF SUCH DAMAGE.
(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD)
-/* The mb->capture_last field uses the lower 16 bits for the last captured
-substring (which can never be greater than 65535) and a bit in the top half
-to mean "capture vector overflowed". This odd way of doing things was
-implemented when it was realized that preserving and restoring the overflow bit
-whenever the last capture number was saved/restored made for a neater
-interface, and doing it this way saved on (a) another variable, which would
-have increased the stack frame size (a big NO-NO in PCRE) and (b) another
-separate set of save/restore instructions. The following defines are used in
-implementing this. */
-
-#define CAPLMASK 0x0000ffff /* The bits used for last_capture */
-#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
-#define OVFLBIT 0x00010000 /* The bit that is set for overflow */
-
-/* Bits for setting in mb->match_function_type to indicate two special types
-of call to match(). We do it this way to save on using another stack variable,
-as stack usage is to be discouraged. */
-
-#define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
-#define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
-
-/* Non-error returns from the match() function. Error returns are externally
-defined PCRE2_ERROR_xxx codes, which are all negative. */
+/* Non-error returns from and within the match() function. Error returns are
+externally defined PCRE2_ERROR_xxx codes, which are all negative. */
#define MATCH_MATCH 1
#define MATCH_NOMATCH 0
-/* Special internal returns from the match() function. Make them sufficiently
-negative to avoid the external error codes. */
+/* Special internal returns used in the match() function. Make them
+sufficiently negative to avoid the external error codes. */
#define MATCH_ACCEPT (-999)
#define MATCH_KETRPOS (-998)
-#define MATCH_ONCE (-997)
/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. */
-#define MATCH_COMMIT (-996)
-#define MATCH_PRUNE (-995)
-#define MATCH_SKIP (-994)
-#define MATCH_SKIP_ARG (-993)
-#define MATCH_THEN (-992)
+#define MATCH_COMMIT (-997)
+#define MATCH_PRUNE (-996)
+#define MATCH_SKIP (-995)
+#define MATCH_SKIP_ARG (-994)
+#define MATCH_THEN (-993)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
-/* Min and max values for the common repeats; for the maxima, 0 => infinity */
+/* Group frame type values. Zero means the frame is not a group frame. The
+lower 16 bits are used for data (e.g. the capture number). Group frames are
+used for most groups so that information about the start is easily available at
+the end without having to scan back through intermediate frames (backtrack
+points). */
+
+#define GF_CAPTURE 0x00010000u
+#define GF_NOCAPTURE 0x00020000u
+#define GF_CONDASSERT 0x00030000u
+#define GF_RECURSE 0x00040000u
-static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
-static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
+/* Masks for the identity and data parts of the group frame type. */
-/* Maximum number of ovector elements that can be saved on the system stack
-when processing OP_RECURSE in non-HEAP_MATCH_RECURSE mode. If the ovector is
-bigger, malloc() is used. This value should be a multiple of 3, because the
-ovector length is always a multiple of 3. */
+#define GF_IDMASK(a) ((a) & 0xffff0000u)
+#define GF_DATAMASK(a) ((a) & 0x0000ffffu)
-#define OP_RECURSE_STACK_SAVE_MAX 45
+/* Repetition types */
+
+enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };
+
+/* Min and max values for the common repeats; a maximum of UINT32_MAX =>
+infinity. */
+
+static const uint32_t rep_min[] = {
+ 0, 0, /* * and *? */
+ 1, 1, /* + and +? */
+ 0, 0, /* ? and ?? */
+ 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */
+ 0, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */
+
+static const uint32_t rep_max[] = {
+ UINT32_MAX, UINT32_MAX, /* * and *? */
+ UINT32_MAX, UINT32_MAX, /* + and +? */
+ 1, 1, /* ? and ?? */
+ 0, 0, /* dummy placefillers for OP_CR[MIN]RANGE */
+ UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */
+
+/* Repetition types - must include OP_CRPOSRANGE (not needed above) */
+
+static const uint32_t rep_typ[] = {
+ REPTYPE_MAX, REPTYPE_MIN, /* * and *? */
+ REPTYPE_MAX, REPTYPE_MIN, /* + and +? */
+ REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */
+ REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */
+ REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */
+ REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */
+
+/* Numbers for RMATCH calls at backtracking points. When these lists are
+changed, the code at RETURN_SWITCH below must be updated in sync. */
+
+enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
+ RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
+ RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
+ RM31, RM32, RM33, RM34, RM35, RM36 };
+
+#ifdef SUPPORT_WIDE_CHARS
+enum { RM100=100, RM101 };
+#endif
+
+#ifdef SUPPORT_UNICODE
+enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
+ RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,
+ RM216, RM217, RM218, RM219, RM220, RM221, RM222 };
+#endif
+
+/* Define short names for general fields in the current backtrack frame, which
+is always pointed to by the F variable. Occasional references to fields in
+other frames are written out explicitly. There are also some fields in the
+current frame whose names start with "temp" that are used for short-term,
+localised backtracking memory. These are #defined with Lxxx names at the point
+of use and undefined afterwards. */
+
+#define Fback_frame F->back_frame
+#define Fcapture_last F->capture_last
+#define Fcurrent_recurse F->current_recurse
+#define Fecode F->ecode
+#define Feptr F->eptr
+#define Fgroup_frame_type F->group_frame_type
+#define Flast_group_offset F->last_group_offset
+#define Flength F->length
+#define Fmark F->mark
+#define Frdepth F->rdepth
+#define Fstart_match F->start_match
+#define Foffset_top F->offset_top
+#define Foccu F->occu
+#define Fop F->op
+#define Fovector F->ovector
+#define Freturn_id F->return_id
+
+
+#ifdef DEBUG_FRAMES_DISPLAY
+/*************************************************
+* Display current frames and contents *
+*************************************************/
+
+/* This debugging function displays the current set of frames and their
+contents. It is not called automatically from anywhere, the intention being
+that calls can be inserted where necessary when debugging frame-related
+problems.
+
+Arguments:
+ f the file to write to
+ F the current top frame
+ P a previous frame of interest
+ frame_size the frame size
+ mb points to the match block
+ s identification text
+
+Returns: nothing
+*/
+
+/* Debug-only dump of every frame from mb->match_frames up to and including F.
+The caller's printf-style text (s plus any following arguments) is written
+after the "FRAMES " tag; when P is not NULL its frame index is appended. Each
+frame is then listed on its own line.
+
+All values handed to fprintf() are cast to the exact type that the conversion
+specifier expects. PCRE2_SIZE values and pointer differences are not
+guaranteed to have the same representation as unsigned long, and passing a
+mismatched type through a variadic call is undefined behaviour. */
+
+static void
+display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
+  match_block *mb, const char *s, ...)
+{
+uint32_t i;
+heapframe *Q;
+va_list ap;
+va_start(ap, s);
+
+fprintf(f, "FRAMES ");
+vfprintf(f, s, ap);
+va_end(ap);
+
+/* Frame numbers are byte offsets from the base of the vector divided by the
+frame size. */
+
+if (P != NULL) fprintf(f, " P=%lu",
+  (unsigned long)(((char *)P - (char *)(mb->match_frames))/frame_size));
+fprintf(f, "\n");
+
+/* Frames are frame_size bytes apart, so step through the vector using byte
+arithmetic rather than heapframe pointer arithmetic. */
+
+for (i = 0, Q = mb->match_frames;
+     Q <= F;
+     i++, Q = (heapframe *)((char *)Q + frame_size))
+  {
+  fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
+    (int)i, (unsigned int)Q->group_frame_type,
+    (unsigned long)(Q->eptr - mb->start_subject), (int)*(Q->ecode),
+    (unsigned long)Q->back_frame, (int)Q->return_id);
+
+  if (Q->last_group_offset == PCRE2_UNSET)
+    fprintf(f, " lgoffset=unset\n");
+  else
+    fprintf(f, " lgoffset=%lu\n",
+      (unsigned long)(Q->last_group_offset/frame_size));
+  }
+}
+
+#endif
+
+
+
+/*************************************************
+* Process a callout *
+*************************************************/
+
+/* This function is called for all callouts, whether "standalone" or at the
+start of a conditional group. Fecode will be pointing to either OP_CALLOUT or
+OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
+with fixed values.
+
+Arguments:
+ F points to the current backtracking frame
+ mb points to the match block
+ lengthptr where to return the length of the callout item
+
+Returns: the return from the callout
+ or 0 if no callout function exists
+*/
+
+static int
+do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
+{
+int result;
+PCRE2_SIZE kept_slot0, kept_slot1;
+PCRE2_SIZE *ext_ovector;
+pcre2_callout_block *block;
+
+/* The length of the callout item is always passed back, even when there is no
+callout function to run. A numerical callout has a fixed length taken from the
+opcode length table; a string callout carries its own length in the compiled
+code. */
+
+if (*Fecode == OP_CALLOUT)
+  *lengthptr = PRIV(OP_lengths)[OP_CALLOUT];
+else
+  *lengthptr = GET(Fecode, 1 + 2*LINK_SIZE);
+
+/* Nothing more to do when no callout function was provided. */
+
+if (mb->callout == NULL) return 0;
+
+/* Before 10.30 the matcher worked directly on the user's ovector and handed
+that same vector to callouts. The working ovector now lives in the
+backtracking frame and has no slots for the overall match offsets. To remain
+backward compatible, the callout is given a vector that starts two slots
+earlier, with capture_top adjusted to match. Those two borrowed slots are
+forced to "unset" for the duration of the call, and their real contents are
+put back afterwards. A separate pointer is used because picky compilers
+complain if references such as Fovector[-2] are used directly. */
+
+ext_ovector = (PCRE2_SIZE *)(Fovector) - 2;
+
+/* The version, subject, subject_length and start_match fields of the callout
+block are set externally. The first three never change; the last is updated
+for each bumpalong. Everything else is filled in here. */
+
+block = mb->cb;
+block->capture_top = (uint32_t)Foffset_top/2 + 1;
+block->capture_last = Fcapture_last;
+block->offset_vector = ext_ovector;
+block->mark = mb->nomatch_mark;
+block->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
+block->pattern_position = GET(Fecode, 1);
+block->next_item_length = GET(Fecode, 1 + LINK_SIZE);
+
+if (*Fecode != OP_CALLOUT)      /* String callout */
+  {
+  block->callout_number = 0;
+  block->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
+  block->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
+  block->callout_string_length =
+    *lengthptr - (1 + 4*LINK_SIZE) - 2;
+  }
+else                            /* Numerical callout */
+  {
+  block->callout_number = Fecode[1 + 2*LINK_SIZE];
+  block->callout_string_offset = 0;
+  block->callout_string = NULL;
+  block->callout_string_length = 0;
+  }
+
+/* Hide the two borrowed slots, run the user's function, then restore them. */
+
+kept_slot0 = ext_ovector[0];
+kept_slot1 = ext_ovector[1];
+ext_ovector[0] = PCRE2_UNSET;
+ext_ovector[1] = PCRE2_UNSET;
+
+result = mb->callout(block, mb->callout_data);
+
+ext_ovector[0] = kept_slot0;
+ext_ovector[1] = kept_slot1;
+
+/* The flags in the callout block are cleared after every call. */
+
+block->callout_flags = 0;
+return result;
+}
@@ -130,10 +339,9 @@ seems unlikely.)
Arguments:
offset index into the offset vector
- offset_top top of the used offset vector
- eptr pointer into the subject
- mb points to match block
caseless TRUE if caseless
+ F the current backtracking frame pointer
+ mb points to match block
lengthptr pointer for returning the length matched
Returns: = 0 sucessful match; number of code units matched is set
@@ -142,21 +350,18 @@ Returns: = 0 sucessful match; number of code units matched is set
*/
static int
-match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr,
- match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr)
+match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
+ PCRE2_SIZE *lengthptr)
{
-#if defined SUPPORT_UNICODE
-BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
-#endif
-
-register PCRE2_SPTR p;
+PCRE2_SPTR p;
PCRE2_SIZE length;
-PCRE2_SPTR eptr_start = eptr;
+PCRE2_SPTR eptr;
+PCRE2_SPTR eptr_start;
/* Deal with an unset group. The default is no match, but there is an option to
match an empty string. */
-if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
+if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
{
if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
{
@@ -168,19 +373,20 @@ if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
/* Separate the caseless and UTF cases for speed. */
-p = mb->start_subject + mb->ovector[offset];
-length = mb->ovector[offset+1] - mb->ovector[offset];
+eptr = eptr_start = Feptr;
+p = mb->start_subject + Fovector[offset];
+length = Fovector[offset+1] - Fovector[offset];
if (caseless)
{
#if defined SUPPORT_UNICODE
- if (utf)
+ if ((mb->poptions & PCRE2_UTF) != 0)
{
/* Match characters up to the end of the reference. NOTE: the number of
code units matched may differ, because in UTF-8 there are some characters
- whose upper and lower case versions code have different numbers of bytes.
- For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
- (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
+ whose upper and lower case codes have different numbers of bytes. For
+ example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
+ bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
sequence of two of the latter. It is important, therefore, to check the
length along the reference, not along the subject (earlier code did this
wrong). */
@@ -226,14 +432,26 @@ if (caseless)
}
/* In the caseful case, we can just compare the code units, whether or not we
-are in UTF mode. */
+are in UTF mode. When partial matching, we have to do this unit-by-unit. */
else
{
- for (; length > 0; length--)
+ if (mb->partial != 0)
+ {
+ for (; length > 0; length--)
+ {
+ if (eptr >= mb->end_subject) return 1; /* Partial match */
+ if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
+ }
+ }
+
+ /* Not partial matching */
+
+ else
{
- if (eptr >= mb->end_subject) return 1; /* Partial match */
- if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /*No match */
+ if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
+ if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
+ eptr += length;
}
}
@@ -243,281 +461,73 @@ return 0; /* Match */
-/***************************************************************************
-****************************************************************************
- RECURSION IN THE match() FUNCTION
-
-The match() function is highly recursive, though not every recursive call
-increases the recursion depth. Nevertheless, some regular expressions can cause
-it to recurse to a great depth. I was writing for Unix, so I just let it call
-itself recursively. This uses the stack for saving everything that has to be
-saved for a recursive call. On Unix, the stack can be large, and this works
-fine.
-
-It turns out that on some non-Unix-like systems there are problems with
-programs that use a lot of stack. (This despite the fact that every last chip
-has oodles of memory these days, and techniques for extending the stack have
-been known for decades.) So....
-
-There is a fudge, triggered by defining HEAP_MATCH_RECURSE, which avoids
-recursive calls by keeping local variables that need to be preserved in blocks
-of memory on the heap instead instead of on the stack. Macros are used to
-achieve this so that the actual code doesn't look very different to what it
-always used to.
-
-The original heap-recursive code used longjmp(). However, it seems that this
-can be very slow on some operating systems. Following a suggestion from Stan
-Switzer, the use of longjmp() has been abolished, at the cost of having to
-provide a unique number for each call to RMATCH. There is no way of generating
-a sequence of numbers at compile time in C. I have given them names, to make
-them stand out more clearly.
-
-Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
-FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
-tests. Furthermore, not using longjmp() means that local dynamic variables
-don't have indeterminate values; this has meant that the frame size can be
-reduced because the result can be "passed back" by straight setting of the
-variable instead of being passed in the frame.
-****************************************************************************
-***************************************************************************/
-
-/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
-below must be updated in sync. */
+/******************************************************************************
+*******************************************************************************
+ "Recursion" in the match() function
-enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
- RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
- RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
- RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
- RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
- RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
- RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
-
-/* These versions of the macros use the stack, as normal. Note that the "rw"
-argument of RMATCH isn't actually used in this definition. */
-
-#ifndef HEAP_MATCH_RECURSE
-#define REGISTER register
-#define RMATCH(ra,rb,rc,rd,re,rw) \
- rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
-#define RRETURN(ra) return ra
-#else
+The original match() function was highly recursive, but this proved to be the
+source of a number of problems over the years, mostly because of the relatively
+small system stacks that are commonly found. As new features were added to
+patterns, various kludges were invented to reduce the amount of stack used,
+making the code hard to understand in places.
-/* These versions of the macros manage a private stack on the heap. Note that
-the "rd" argument of RMATCH isn't actually used in this definition. It's the mb
-argument of match(), which never changes. */
+A version did exist that used individual frames on the heap instead of calling
+match() recursively, but this ran substantially slower. The current version is
+a refactoring that uses a vector of frames to remember backtracking points.
+This runs no slower, and possibly even a bit faster than the original recursive
+implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
+50 frames) is allocated on the system stack. If this is not big enough, the
+heap is used for a larger vector.
-#define REGISTER
+*******************************************************************************
+******************************************************************************/
-#define RMATCH(ra,rb,rc,rd,re,rw)\
- {\
- heapframe *newframe = frame->Xnextframe;\
- if (newframe == NULL)\
- {\
- newframe = (heapframe *)(mb->stack_memctl.malloc)\
- (sizeof(heapframe), mb->stack_memctl.memory_data);\
- if (newframe == NULL) RRETURN(PCRE2_ERROR_NOMEMORY);\
- newframe->Xnextframe = NULL;\
- frame->Xnextframe = newframe;\
- }\
- frame->Xwhere = rw;\
- newframe->Xeptr = ra;\
- newframe->Xecode = rb;\
- newframe->Xmstart = mstart;\
- newframe->Xoffset_top = rc;\
- newframe->Xeptrb = re;\
- newframe->Xrdepth = frame->Xrdepth + 1;\
- newframe->Xprevframe = frame;\
- frame = newframe;\
- goto HEAP_RECURSE;\
- L_##rw:;\
- }
-
-#define RRETURN(ra)\
- {\
- heapframe *oldframe = frame;\
- frame = oldframe->Xprevframe;\
- if (frame != NULL)\
- {\
- rrc = ra;\
- goto HEAP_RETURN;\
- }\
- return ra;\
- }
-/* Structure for remembering the local variables in a private frame. Arrange it
-so as to minimize the number of holes. */
-
-typedef struct heapframe {
- struct heapframe *Xprevframe;
- struct heapframe *Xnextframe;
-
-#ifdef SUPPORT_UNICODE
- PCRE2_SPTR Xcharptr;
-#endif
- PCRE2_SPTR Xeptr;
- PCRE2_SPTR Xecode;
- PCRE2_SPTR Xmstart;
- PCRE2_SPTR Xcallpat;
- PCRE2_SPTR Xdata;
- PCRE2_SPTR Xnext_ecode;
- PCRE2_SPTR Xpp;
- PCRE2_SPTR Xprev;
- PCRE2_SPTR Xsaved_eptr;
-
- eptrblock *Xeptrb;
-
- PCRE2_SIZE Xlength;
- PCRE2_SIZE Xoffset;
- PCRE2_SIZE Xoffset_top;
- PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;
-
- uint32_t Xfc;
- uint32_t Xnumber;
- uint32_t Xrdepth;
- uint32_t Xop;
- uint32_t Xsave_capture_last;
-
-#ifdef SUPPORT_UNICODE
- uint32_t Xprop_value;
- int Xprop_type;
- int Xprop_fail_result;
- int Xoclength;
-#endif
-
- int Xcodelink;
- int Xctype;
- int Xfi;
- int Xmax;
- int Xmin;
- int Xwhere; /* Where to jump back to */
-
- BOOL Xcondition;
- BOOL Xcur_is_word;
- BOOL Xprev_is_word;
-
- eptrblock Xnewptrb;
- recursion_info Xnew_recursive;
-
-#ifdef SUPPORT_UNICODE
- PCRE2_UCHAR Xocchars[6];
-#endif
-} heapframe;
-
-#endif
-
-
-/***************************************************************************
-***************************************************************************/
-
-
-/* When HEAP_MATCH_RECURSE is not defined, the match() function implements
-backtrack points by calling itself recursively in all but one case. The one
-special case is when processing OP_RECURSE, which specifies recursion in the
-pattern. The entire ovector must be saved and restored while processing
-OP_RECURSE. If the ovector is small enough, instead of calling match()
-directly, op_recurse_ovecsave() is called. This function uses the system stack
-to save the ovector while calling match() to process the pattern recursion. */
-
-#ifndef HEAP_MATCH_RECURSE
-
-/* We need a prototype for match() because it is mutually recursive with
-op_recurse_ovecsave(). */
-
-static int
-match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart,
- PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth);
-
/*************************************************
-* Process OP_RECURSE, stacking ovector *
+* Macros for the match() function *
*************************************************/
-/* When this function is called, mb->recursive has already been updated to
-point to a new recursion data block, and all its fields other than ovec_save
-have been set.
-
-This function exists so that the local vector variable ovecsave is no longer
-defined in the match() function, as it was in PCRE1. It is used only when there
-is recursion in the pattern, so it wastes a lot of stack to have it defined for
-every call of match(). We now use this function as an indirect way of calling
-match() only in the case when ovecsave is needed. (David Wheeler used to say
-"All problems in computer science can be solved by another level of
-indirection.")
+/* These macros pack up tests that are used for partial matching several times
+in the code. We set the "hit end" flag if the pointer is at the end of the
+subject and also past the earliest inspected character (i.e. something has been
+matched, even if not part of the actual matched string). For hard partial
+matching, we then return immediately. The second one is used when we already
+know we are past the end of the subject. */
-HOWEVER: when this file is compiled by gcc in an optimizing mode, because this
-function is called only once, and only from within match(), gcc will "inline"
-it - that is, move it inside match() - and this completely negates its reason
-for existence. Therefore, we mark it as non-inline when gcc is in use.
-
-Arguments:
- eptr pointer to current character in subject
- callpat the recursion point in the pattern
- mstart pointer to the current match start position (can be modified
- by encountering \K)
- offset_top current top pointer (highest ovector offset used + 1)
- mb pointer to "static" info block for the match
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- rdepth the recursion depth
-
-Returns: a match() return code
-*/
-
-static int
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
-__attribute__ ((noinline))
-#endif
-op_recurse_ovecsave(REGISTER PCRE2_SPTR eptr, PCRE2_SPTR callpat,
- PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb,
- uint32_t rdepth)
-{
-register int rrc;
-BOOL cbegroup = *callpat >= OP_SBRA;
-recursion_info *new_recursive = mb->recursive;
-PCRE2_SIZE ovecsave[OP_RECURSE_STACK_SAVE_MAX];
-
-/* Save the ovector */
+#define CHECK_PARTIAL()\
+ if (mb->partial != 0 && Feptr >= mb->end_subject && \
+ Feptr > mb->start_used_ptr) \
+ { \
+ mb->hitend = TRUE; \
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
+ }
-new_recursive->ovec_save = ovecsave;
-memcpy(ovecsave, mb->ovector, mb->offset_end * sizeof(PCRE2_SIZE));
+#define SCHECK_PARTIAL()\
+ if (mb->partial != 0 && Feptr > mb->start_used_ptr) \
+ { \
+ mb->hitend = TRUE; \
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
+ }
-/* Do the recursion. After processing each alternative, restore the ovector
-data and the last captured value. */
+/* These macros are used to implement backtracking. They simulate a recursive
+call to the match() function by means of a local vector of frames which
+remember the backtracking points. */
-do
- {
- if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP;
- rrc = match(eptr, callpat + PRIV(OP_lengths)[*callpat], mstart, offset_top,
- mb, eptrb, rdepth + 1);
- memcpy(mb->ovector, new_recursive->ovec_save,
- mb->offset_end * sizeof(PCRE2_SIZE));
- mb->capture_last = new_recursive->saved_capture_last;
- if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) return rrc;
-
- /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
- recursion; they cause a NOMATCH for the entire recursion. These codes
- are defined in a range that can be tested for. */
-
- if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
- return MATCH_NOMATCH;
-
- /* Any return code other than NOMATCH is an error. Otherwise, advance to the
- next alternative or to the end of the recursing subpattern. If there were
- nested recursions, mb->recursive might be changed, so reset it before
- looping. */
-
- if (rrc != MATCH_NOMATCH) return rrc;
- mb->recursive = new_recursive;
- callpat += GET(callpat, 1);
+#define RMATCH(ra,rb)\
+ {\
+ start_ecode = ra;\
+ Freturn_id = rb;\
+ goto MATCH_RECURSE;\
+ L_##rb:;\
}
-while (*callpat == OP_ALT); /* Loop for the alternatives */
-/* None of the alternatives matched. */
-
-return MATCH_NOMATCH;
-}
-#endif /* HEAP_MATCH_RECURSE */
+#define RRETURN(ra)\
+ {\
+ rrc = ra;\
+ goto RETURN_SWITCH;\
+ }
@@ -525,2470 +535,1270 @@ return MATCH_NOMATCH;
* Match from current position *
*************************************************/
-/* This function is called recursively in many circumstances. Whenever it
-returns a negative (error) response, the outer incarnation must also return the
-same response. */
+/* This function is called to run one match attempt at a single starting point
+in the subject.
-/* These macros pack up tests that are used for partial matching, and which
-appear several times in the code. We set the "hit end" flag if the pointer is
-at the end of the subject and also past the earliest inspected character (i.e.
-something has been matched, even if not part of the actual matched string). For
-hard partial matching, we then return immediately. The second one is used when
-we already know we are past the end of the subject. */
-
-#define CHECK_PARTIAL()\
- if (mb->partial != 0 && eptr >= mb->end_subject && \
- eptr > mb->start_used_ptr) \
- { \
- mb->hitend = TRUE; \
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); \
- }
-
-#define SCHECK_PARTIAL()\
- if (mb->partial != 0 && eptr > mb->start_used_ptr) \
- { \
- mb->hitend = TRUE; \
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL); \
- }
-
-
-/* Performance note: It might be tempting to extract commonly used fields from
-the mb structure (e.g. utf, end_subject) into individual variables to improve
+Performance note: It might be tempting to extract commonly used fields from the
+mb structure (e.g. end_subject) into individual variables to improve
performance. Tests using gcc on a SPARC disproved this; in the first case, it
made performance worse.
Arguments:
- eptr pointer to current character in subject
- ecode pointer to current position in compiled code
- mstart pointer to the current match start position (can be modified
- by encountering \K)
- offset_top current top pointer (highest ovector offset used + 1)
- mb pointer to "static" info block for the match
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- rdepth the recursion depth
-
-Returns: MATCH_MATCH if matched ) these values are >= 0
- MATCH_NOMATCH if failed to match )
- a negative MATCH_xxx value for PRUNE, SKIP, etc
- a negative PCRE2_ERROR_xxx value if aborted by an error condition
- (e.g. stopped by repeated call or recursion limit)
+ start_eptr starting character in subject
+ start_ecode starting position in compiled code
+ ovector pointer to the final output vector
+ oveccount number of pairs in ovector
+ top_bracket number of capturing parentheses in the pattern
+ frame_size size of each backtracking frame
+ mb pointer to "static" variables block
+
+Returns: MATCH_MATCH if matched ) these values are >= 0
+ MATCH_NOMATCH if failed to match )
+ negative MATCH_xxx value for PRUNE, SKIP, etc
+ negative PCRE2_ERROR_xxx value if aborted by an error condition
+ (e.g. stopped by repeated call or depth limit)
*/
static int
-match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart,
- PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth)
+match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
+ uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
+ match_block *mb)
{
-/* These variables do not need to be preserved over recursion in this function,
-so they can be ordinary variables in all cases. Mark some of them with
-"register" because they are used a lot in loops. */
+/* Frame-handling variables */
-register int rrc; /* Returns from recursive calls */
-register int i; /* Used for loops not involving calls to RMATCH() */
-register uint32_t c; /* Character values not kept over RMATCH() calls */
-register BOOL utf; /* Local copy of UTF flag for speed */
+heapframe *F; /* Current frame pointer */
+heapframe *N = NULL; /* Temporary frame pointers */
+heapframe *P = NULL;
+heapframe *assert_accept_frame; /* For passing back the frame with captures */
+PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
-BOOL minimize, possessive; /* Quantifier options */
-BOOL caseless;
-int condcode;
+/* Local variables that do not need to be preserved over calls to RMATCH(). */
-/* When recursion is not being used, all "local" variables that have to be
-preserved over calls to RMATCH() are part of a "frame". We set up the top-level
-frame on the stack here; subsequent instantiations are obtained from the heap
-whenever RMATCH() does a "recursion". See the macro definitions above. Putting
-the top-level on the stack rather than malloc-ing them all gives a performance
-boost in many cases where there is not much "recursion". */
+PCRE2_SPTR bracode; /* Temp pointer to start of group */
+PCRE2_SIZE offset; /* Used for group offsets */
+PCRE2_SIZE length; /* Used for various length calculations */
-#ifdef HEAP_MATCH_RECURSE
-heapframe *frame = (heapframe *)mb->match_frames_base;
-
-/* Copy in the original argument variables */
-
-frame->Xeptr = eptr;
-frame->Xecode = ecode;
-frame->Xmstart = mstart;
-frame->Xoffset_top = offset_top;
-frame->Xeptrb = eptrb;
-frame->Xrdepth = rdepth;
-
-/* This is where control jumps back to to effect "recursion" */
-
-HEAP_RECURSE:
+int rrc; /* Return from functions & backtracking "recursions" */
+#ifdef SUPPORT_UNICODE
+int proptype; /* Type of character property */
+#endif
-/* Macros make the argument variables come from the current frame */
+uint32_t i; /* Used for local loops */
+uint32_t fc; /* Character values */
+uint32_t number; /* Used for group and other numbers */
+uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
+uint32_t group_frame_type; /* Specifies type for new group frames */
-#define eptr frame->Xeptr
-#define ecode frame->Xecode
-#define mstart frame->Xmstart
-#define offset_top frame->Xoffset_top
-#define eptrb frame->Xeptrb
-#define rdepth frame->Xrdepth
+BOOL condition; /* Used in conditional groups */
+BOOL cur_is_word; /* Used in "word" tests */
+BOOL prev_is_word; /* Used in "word" tests */
-/* Ditto for the local variables */
+/* UTF flag */
#ifdef SUPPORT_UNICODE
-#define charptr frame->Xcharptr
-#define prop_value frame->Xprop_value
-#define prop_type frame->Xprop_type
-#define prop_fail_result frame->Xprop_fail_result
-#define oclength frame->Xoclength
-#define occhars frame->Xocchars
+BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+#else
+BOOL utf = FALSE;
#endif
+/* This is the length of the last part of a backtracking frame that must be
+copied when a new frame is created. */
-#define callpat frame->Xcallpat
-#define codelink frame->Xcodelink
-#define data frame->Xdata
-#define next_ecode frame->Xnext_ecode
-#define pp frame->Xpp
-#define prev frame->Xprev
-#define saved_eptr frame->Xsaved_eptr
-
-#define new_recursive frame->Xnew_recursive
-
-#define ctype frame->Xctype
-#define fc frame->Xfc
-#define fi frame->Xfi
-#define length frame->Xlength
-#define max frame->Xmax
-#define min frame->Xmin
-#define number frame->Xnumber
-#define offset frame->Xoffset
-#define op frame->Xop
-#define save_capture_last frame->Xsave_capture_last
-#define save_offset1 frame->Xsave_offset1
-#define save_offset2 frame->Xsave_offset2
-#define save_offset3 frame->Xsave_offset3
-
-#define condition frame->Xcondition
-#define cur_is_word frame->Xcur_is_word
-#define prev_is_word frame->Xprev_is_word
-
-#define newptrb frame->Xnewptrb
-
-/* When normal stack-based recursion is being used for match(), local variables
-are allocated on the stack and get preserved during recursion in the usual way.
-In this environment, fi and i, and fc and c, can be the same variables. */
-
-#else /* HEAP_MATCH_RECURSE not defined */
-#define fi i
-#define fc c
-
-/* Many of the following variables are used only in small blocks of the code.
-My normal style of coding would have declared them within each of those blocks.
-However, in order to accommodate the version of this code that uses an external
-"stack" implemented on the heap, it is easier to declare them all here, so the
-declarations can be cut out in a block. The only declarations within blocks
-below are for variables that do not have to be preserved over a recursive call
-to RMATCH(). */
-
-#ifdef SUPPORT_UNICODE
-PCRE2_SPTR charptr;
-#endif
-PCRE2_SPTR callpat;
-PCRE2_SPTR data;
-PCRE2_SPTR next_ecode;
-PCRE2_SPTR pp;
-PCRE2_SPTR prev;
-PCRE2_SPTR saved_eptr;
+frame_copy_size = frame_size - offsetof(heapframe, eptr);
-PCRE2_SIZE length;
-PCRE2_SIZE offset;
-PCRE2_SIZE save_offset1, save_offset2, save_offset3;
+/* Set up the first current frame at the start of the vector, and initialize
+fields that are not reset for new frames. */
-uint32_t number;
-uint32_t op;
-uint32_t save_capture_last;
+F = mb->match_frames;
+Frdepth = 0; /* "Recursion" depth */
+Fcapture_last = 0; /* Number of most recent capture */
+Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
+Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
+Fmark = NULL; /* Most recent mark */
+Foffset_top = 0; /* End of captures within the frame */
+Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
+group_frame_type = 0; /* Not a start of group frame */
+goto NEW_FRAME; /* Start processing with this frame */
-#ifdef SUPPORT_UNICODE
-uint32_t prop_value;
-int prop_type;
-int prop_fail_result;
-int oclength;
-PCRE2_UCHAR occhars[6];
-#endif
+/* Come back here when we want to create a new frame for remembering a
+backtracking point. */
-int codelink;
-int ctype;
-int max;
-int min;
+MATCH_RECURSE:
-BOOL condition;
-BOOL cur_is_word;
-BOOL prev_is_word;
+/* Set up a new backtracking frame. If the vector is full, get a new one
+on the heap, doubling the size, but constrained by the heap limit. */
-eptrblock newptrb;
-recursion_info new_recursive;
-#endif /* HEAP_MATCH_RECURSE not defined */
+N = (heapframe *)((char *)F + frame_size);
+if (N >= mb->match_frames_top)
+ {
+ PCRE2_SIZE newsize = mb->frame_vector_size * 2;
+ heapframe *new;
-/* To save space on the stack and in the heap frame, I have doubled up on some
-of the local variables that are used only in localised parts of the code, but
-still need to be preserved over recursive calls of match(). These macros define
-the alternative names that are used. */
+ if ((newsize / 1024) > mb->heap_limit)
+ {
+ PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
+ if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
+ newsize = maxsize;
+ }
-#define allow_zero cur_is_word
-#define cbegroup condition
-#define code_offset codelink
-#define condassert condition
-#define foc number
-#define matched_once prev_is_word
-#define save_mark data
+ new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
+ if (new == NULL) return PCRE2_ERROR_NOMEMORY;
+ memcpy(new, mb->match_frames, mb->frame_vector_size);
-/* These statements are here to stop the compiler complaining about unitialized
-variables. */
+ F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
+ N = (heapframe *)((char *)F + frame_size);
-#ifdef SUPPORT_UNICODE
-prop_value = 0;
-prop_fail_result = 0;
-#endif
+ if (mb->match_frames != mb->stack_frames)
+ mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
+ mb->match_frames = new;
+ mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
+ mb->frame_vector_size = newsize;
+ }
+#ifdef DEBUG_SHOW_RMATCH
+fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
+if (group_frame_type != 0)
+ {
+ fprintf(stderr, " type=%x ", group_frame_type);
+ switch (GF_IDMASK(group_frame_type))
+ {
+ case GF_CAPTURE:
+ fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
+ break;
-/* This label is used for tail recursion, which is used in a few cases even
-when HEAP_MATCH_RECURSE is not defined, in order to reduce the amount of stack
-that is used. Thanks to Ian Taylor for noticing this possibility and sending
-the original patch. */
+ case GF_NOCAPTURE:
+ fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
+ break;
-TAIL_RECURSE:
+ case GF_CONDASSERT:
+ fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
+ break;
-/* OK, now we can get on with the real code of the function. Recursive calls
-are specified by the macro RMATCH and RRETURN is used to return. When
-HEAP_MATCH_RECURSE is *not* defined, these just turn into a recursive call to
-match() and a "return", respectively. However, RMATCH isn't like a function
-call because it's quite a complicated macro. It has to be used in one
-particular way. This shouldn't, however, impact performance when true recursion
-is being used. */
+ case GF_RECURSE:
+ fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
+ break;
-#ifdef SUPPORT_UNICODE
-utf = (mb->poptions & PCRE2_UTF) != 0;
-#else
-utf = FALSE;
+ default:
+ fprintf(stderr, "*** unknown ***");
+ break;
+ }
+ }
+fprintf(stderr, "\n");
#endif
-/* First check that we haven't called match() too many times, or that we
-haven't exceeded the recursive call limit. */
+/* Copy those fields that must be copied into the new frame, increase the
+"recursion" depth (i.e. the new frame's index) and then make the new frame
+current. */
+
+memcpy((char *)N + offsetof(heapframe, eptr),
+ (char *)F + offsetof(heapframe, eptr),
+ frame_copy_size);
+
+N->rdepth = Frdepth + 1;
+F = N;
-if (mb->match_call_count++ >= mb->match_limit) RRETURN(PCRE2_ERROR_MATCHLIMIT);
-if (rdepth >= mb->match_limit_recursion) RRETURN(PCRE2_ERROR_RECURSIONLIMIT);
+/* Carry on processing with a new frame. */
-/* At the start of a group with an unlimited repeat that may match an empty
-string, the variable mb->match_function_type contains the MATCH_CBEGROUP bit.
-It is done this way to save having to use another function argument, which
-would take up space on the stack. See also MATCH_CONDASSERT below.
+NEW_FRAME:
+Fgroup_frame_type = group_frame_type;
+Fecode = start_ecode; /* Starting code pointer */
+Fback_frame = frame_size; /* Default is go back one frame */
-When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
-such remembered pointers, to be checked when we hit the closing ket, in order
-to break infinite loops that match no characters. When match() is called in
-other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
-NOT be used with tail recursion, because the memory block that is used is on
-the stack, so a new one may be required for each match(). */
+/* If this is a special type of group frame, remember its offset for quick
+access at the end of the group. If this is a recursion, set a new current
+recursion value. */
-if ((mb->match_function_type & MATCH_CBEGROUP) != 0)
+if (group_frame_type != 0)
{
- newptrb.epb_saved_eptr = eptr;
- newptrb.epb_prev = eptrb;
- eptrb = &newptrb;
- mb->match_function_type &= ~MATCH_CBEGROUP;
+ Flast_group_offset = (char *)F - (char *)mb->match_frames;
+ if (GF_IDMASK(group_frame_type) == GF_RECURSE)
+ Fcurrent_recurse = GF_DATAMASK(group_frame_type);
+ group_frame_type = 0;
}
-/* Now, at last, we can start processing the opcodes. */
+
+/* ========================================================================= */
+/* This is the main processing loop. First check that we haven't recorded too
+many backtracks (search tree is too large), or that we haven't exceeded the
+recursive depth limit (used too many backtracking frames). If not, process the
+opcodes. */
+
+if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
+if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
for (;;)
{
- minimize = possessive = FALSE;
- op = *ecode;
+#ifdef DEBUG_SHOW_OPS
+fprintf(stderr, "++ op=%d\n", *Fecode);
+#endif
- switch(op)
+ Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
+ switch(Fop)
{
- case OP_MARK:
- mb->nomatch_mark = ecode + 2;
- mb->mark = NULL; /* In case previously set by assertion */
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb,
- eptrb, RM55);
- if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
- mb->mark == NULL) mb->mark = ecode + 2;
-
- /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
- argument, and we must check whether that argument matches this MARK's
- argument. It is passed back in mb->start_match_ptr (an overloading of that
- variable). If it does match, we reset that variable to the current subject
- position and return MATCH_SKIP. Otherwise, pass back the return code
- unaltered. */
+ /* ===================================================================== */
+ /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
+ any currently open capturing brackets. Unlike reaching the end of a group,
+ where we know the starting frame is at the top of the chained frames, in
+ this case we have to search back for the relevant frame in case other types
+ of group that use chained frames have intervened. Multiple OP_CLOSEs always
+ come innermost first, which matches the chain order. We can ignore this in
+ a recursion, because captures are not passed out of recursions. */
- else if (rrc == MATCH_SKIP_ARG &&
- PRIV(strcmp)(ecode + 2, mb->start_match_ptr) == 0)
+ case OP_CLOSE:
+ if (Fcurrent_recurse == RECURSE_UNSET)
{
- mb->start_match_ptr = eptr;
- RRETURN(MATCH_SKIP);
+ number = GET2(Fecode, 1);
+ offset = Flast_group_offset;
+ for(;;)
+ {
+ if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
+ N = (heapframe *)((char *)mb->match_frames + offset);
+ P = (heapframe *)((char *)N - frame_size);
+ if (N->group_frame_type == (GF_CAPTURE | number)) break;
+ offset = P->last_group_offset;
+ }
+ offset = (number << 1) - 2;
+ Fcapture_last = number;
+ Fovector[offset] = P->eptr - mb->start_subject;
+ Fovector[offset+1] = Feptr - mb->start_subject;
+ if (offset >= Foffset_top) Foffset_top = offset + 2;
}
- RRETURN(rrc);
-
- case OP_FAIL:
- RRETURN(MATCH_NOMATCH);
-
- case OP_COMMIT:
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM52);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_COMMIT);
-
- case OP_PRUNE:
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM51);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_PRUNE);
-
- case OP_PRUNE_ARG:
- mb->nomatch_mark = ecode + 2;
- mb->mark = NULL; /* In case previously set by assertion */
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb,
- eptrb, RM56);
- if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
- mb->mark == NULL) mb->mark = ecode + 2;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_PRUNE);
-
- case OP_SKIP:
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM53);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->start_match_ptr = eptr; /* Pass back current position */
- RRETURN(MATCH_SKIP);
-
- /* Note that, for Perl compatibility, SKIP with an argument does NOT set
- nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
- not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
- that failed and any that precede it (either they also failed, or were not
- triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
- SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
- set to the count of the one that failed. */
+ Fecode += PRIV(OP_lengths)[*Fecode];
+ break;
- case OP_SKIP_ARG:
- mb->skip_arg_count++;
- if (mb->skip_arg_count <= mb->ignore_skip_arg)
- {
- ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
- break;
- }
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, mb,
- eptrb, RM57);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- /* Pass back the current skip name by overloading mb->start_match_ptr and
- returning the special MATCH_SKIP_ARG return code. This will either be
- caught by a matching MARK, or get to the top, where it causes a rematch
- with mb->ignore_skip_arg set to the value of mb->skip_arg_count. */
+ /* ===================================================================== */
+ /* Real or forced end of the pattern, assertion, or recursion. In an
+ assertion ACCEPT, update the last used pointer and remember the current
+ frame so that the captures and mark can be fished out of it. */
- mb->start_match_ptr = ecode + 2;
- RRETURN(MATCH_SKIP_ARG);
+ case OP_ASSERT_ACCEPT:
+ if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
+ assert_accept_frame = F;
+ RRETURN(MATCH_ACCEPT);
- /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
- the branch in which it occurs can be determined. Overload the start of
- match pointer to do this. */
+ /* If recursing, we have to find the most recent recursion. */
- case OP_THEN:
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM54);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->start_match_ptr = ecode;
- RRETURN(MATCH_THEN);
+ case OP_ACCEPT:
+ case OP_END:
- case OP_THEN_ARG:
- mb->nomatch_mark = ecode + 2;
- mb->mark = NULL; /* In case previously set by assertion */
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
- mb, eptrb, RM58);
- if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
- mb->mark == NULL) mb->mark = ecode + 2;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->start_match_ptr = ecode;
- RRETURN(MATCH_THEN);
+ /* Handle end of a recursion. */
- /* Handle an atomic group that does not contain any capturing parentheses.
- This can be handled like an assertion. Prior to 8.13, all atomic groups
- were handled this way. In 8.13, the code was changed as below for ONCE, so
- that backups pass through the group and thereby reset captured values.
- However, this uses a lot more stack, so in 8.20, atomic groups that do not
- contain any captures generate OP_ONCE_NC, which can be handled in the old,
- less stack intensive way.
-
- Check the alternative branches in turn - the matching won't pass the KET
- for this kind of subpattern. If any one branch matches, we carry on as at
- the end of a normal bracket, leaving the subject pointer, but resetting
- the start-of-match value in case it was changed by \K. */
-
- case OP_ONCE_NC:
- prev = ecode;
- saved_eptr = eptr;
- save_mark = mb->mark;
- do
+ if (Fcurrent_recurse != RECURSE_UNSET)
{
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM64);
- if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
+ offset = Flast_group_offset;
+ for(;;)
{
- mstart = mb->start_match_ptr;
- break;
- }
- if (rrc == MATCH_THEN)
- {
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
- rrc = MATCH_NOMATCH;
+ if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
+ N = (heapframe *)((char *)mb->match_frames + offset);
+ P = (heapframe *)((char *)N - frame_size);
+ if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
+ offset = P->last_group_offset;
}
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += GET(ecode,1);
- mb->mark = save_mark;
- }
- while (*ecode == OP_ALT);
+ /* N is now the frame of the recursion; the previous frame is at the
+ OP_RECURSE position. Go back there, copying the current subject position
+ and mark, and move on past the OP_RECURSE. */
- /* If hit the end of the group (which could be repeated), fail */
-
- if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
-
- /* Continue as from after the group, updating the offsets high water
- mark, since extracts may have been taken. */
-
- do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
-
- offset_top = mb->end_offset_top;
- eptr = mb->end_match_ptr;
+ P->eptr = Feptr;
+ P->mark = Fmark;
+ F = P;
+ Fecode += 1 + LINK_SIZE;
+ continue;
+ }
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. */
+ /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
+ is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
+ start of the subject. In both cases, backtracking will then try other
+ alternatives, if any. */
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1+LINK_SIZE;
- break;
- }
+ if (Feptr == Fstart_match &&
+ ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
+ ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
+ Fstart_match == mb->start_subject + mb->start_offset)))
+ RRETURN(MATCH_NOMATCH);
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. The second "call" of match()
- uses tail recursion, to avoid using another stack frame. */
+ /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
+ the end of the subject. At the end of the pattern (OP_END) we backtrack to
+ try alternatives; after (*ACCEPT) we fail the match at this position. */
- if (*ecode == OP_KETRMIN)
+ if (Feptr < mb->end_subject &&
+ ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
{
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM65);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode = prev;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, mb, eptrb, RM66);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- goto TAIL_RECURSE;
+ if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
+ return MATCH_NOMATCH;
}
- /* Control never gets here */
- /* Handle a capturing bracket, other than those that are possessive with an
- unlimited repeat. If there is space in the offset vector, save the current
- subject position in the working slot at the top of the vector. We mustn't
- change the current values of the data slot, because they may be set from a
- previous iteration of this group, and be referred to by a reference inside
- the group. A failure to match might occur after the group has succeeded,
- if something later on doesn't match. For this reason, we need to restore
- the working value and also the values of the final offsets, in case they
- were set by a previous iteration of the same bracket.
-
- If there isn't enough space in the offset vector, treat this as if it were
- a non-capturing bracket. Don't worry about setting the flag for the error
- case here; that is handled in the code for KET. */
+ /* We have a successful match of the whole pattern. Record the result and
+ then do a direct return from the function. If there is space in the offset
+ vector, set any pairs that follow the highest-numbered captured string but
+ are less than the number of capturing groups in the pattern to PCRE2_UNSET.
+ It is documented that this happens. "Gaps" are set to PCRE2_UNSET
+ dynamically. It is only those at the end that need setting here. */
- case OP_CBRA:
- case OP_SCBRA:
- number = GET2(ecode, 1+LINK_SIZE);
- offset = number << 1;
+ mb->end_match_ptr = Feptr; /* Record where we ended */
+ mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
+ mb->mark = Fmark; /* and the last success mark */
+ if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
- if (offset < mb->offset_max)
- {
- save_offset1 = mb->ovector[offset];
- save_offset2 = mb->ovector[offset+1];
- save_offset3 = mb->ovector[mb->offset_end - number];
- save_capture_last = mb->capture_last;
- save_mark = mb->mark;
+ ovector[0] = Fstart_match - mb->start_subject;
+ ovector[1] = Feptr - mb->start_subject;
- mb->ovector[mb->offset_end - number] = eptr - mb->start_subject;
+ /* Set i to the smaller of the sizes of the external and frame ovectors. */
- for (;;)
- {
- if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP;
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM1);
- if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
-
- /* If we backed up to a THEN, check whether it is within the current
- branch by comparing the address of the THEN that is passed back with
- the end of the branch. If it is within the current branch, and the
- branch is one of two or more alternatives (it either starts or ends
- with OP_ALT), we have reached the limit of THEN's action, so convert
- the return code to NOMATCH, which will cause normal backtracking to
- happen from now on. Otherwise, THEN is passed back to an outer
- alternative. This implements Perl's treatment of parenthesized groups,
- where a group not containing | does not affect the current alternative,
- that is, (X) is NOT the same as (X|(*F)). */
-
- if (rrc == MATCH_THEN)
- {
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
- rrc = MATCH_NOMATCH;
- }
-
- /* Anything other than NOMATCH is passed back. */
-
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->capture_last = save_capture_last;
- ecode += GET(ecode, 1);
- mb->mark = save_mark;
- if (*ecode != OP_ALT) break;
- }
+ i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
+ memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
+ while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
+ return MATCH_MATCH; /* Note: NOT RRETURN */
- mb->ovector[offset] = save_offset1;
- mb->ovector[offset+1] = save_offset2;
- mb->ovector[mb->offset_end - number] = save_offset3;
- /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
+ /*===================================================================== */
+ /* Match any single character type except newline; have to take care with
+ CRLF newlines and partial matching. */
- RRETURN(rrc);
+ case OP_ANY:
+ if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
+ if (mb->partial != 0 &&
+ Feptr == mb->end_subject - 1 &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
+ {
+ mb->hitend = TRUE;
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
+ /* Fall through */
- /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
- as a non-capturing bracket. */
+ /* Match any single character whatsoever. */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
+ case OP_ALLANY:
+ if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
+ { /* not be updated before SCHECK_PARTIAL. */
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ Feptr++;
+#ifdef SUPPORT_UNICODE
+ if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
+#endif
+ Fecode++;
+ break;
- /* Non-capturing or atomic group, except for possessive with unlimited
- repeat and ONCE group with no captures. Loop for all the alternatives.
- When we get to the final alternative within the brackets, we used to return
- the result of a recursive call to match() whatever happened so it was
- possible to reduce stack usage by turning this into a tail recursion,
- except in the case of a possibly empty group. However, now that there is
- the possiblity of (*THEN) occurring in the final alternative, this
- optimization is no longer always possible.
+ /* ===================================================================== */
+ /* Match a single code unit, even in UTF mode. This opcode really does
+ match any code unit, even newline. (It really should be called ANYCODEUNIT,
+ of course - the byte name is from pre-16 bit days.) */
- We can optimize if we know there are no (*THEN)s in the pattern; at present
- this is the best that can be done.
+ case OP_ANYBYTE:
+ if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
+ { /* not be updated before SCHECK_PARTIAL. */
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ Feptr++;
+ Fecode++;
+ break;
- MATCH_ONCE is returned when the end of an atomic group is successfully
- reached, but subsequent matching fails. It passes back up the tree (causing
- captured values to be reset) until the original atomic group level is
- reached. This is tested by comparing mb->once_target with the start of the
- group. At this point, the return is converted into MATCH_NOMATCH so that
- previous backup points can be taken. */
- case OP_ONCE:
- case OP_BRA:
- case OP_SBRA:
+ /* ===================================================================== */
+ /* Match a single character, casefully */
- for (;;)
+ case OP_CHAR:
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- if (op >= OP_SBRA || op == OP_ONCE)
- mb->match_function_type |= MATCH_CBEGROUP;
-
- /* If this is not a possibly empty group, and there are no (*THEN)s in
- the pattern, and this is the final alternative, optimize as described
- above. */
-
- else if (!mb->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
+ Flength = 1;
+ Fecode++;
+ GETCHARLEN(fc, Fecode, Flength);
+ if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
{
- ecode += PRIV(OP_lengths)[*ecode];
- goto TAIL_RECURSE;
- }
-
- /* In all other cases, we have to make another call to match(). */
-
- save_mark = mb->mark;
- save_capture_last = mb->capture_last;
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, eptrb,
- RM2);
-
- /* See comment in the code for capturing groups above about handling
- THEN. */
-
- if (rrc == MATCH_THEN)
- {
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
- rrc = MATCH_NOMATCH;
+ CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
+ RRETURN(MATCH_NOMATCH);
}
-
- if (rrc != MATCH_NOMATCH)
+ for (; Flength > 0; Flength--)
{
- if (rrc == MATCH_ONCE)
- {
- PCRE2_SPTR scode = ecode;
- if (*scode != OP_ONCE) /* If not at start, find it */
- {
- while (*scode == OP_ALT) scode += GET(scode, 1);
- scode -= GET(scode, 1);
- }
- if (mb->once_target == scode) rrc = MATCH_NOMATCH;
- }
- RRETURN(rrc);
+ if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
}
- ecode += GET(ecode, 1);
- mb->mark = save_mark;
- if (*ecode != OP_ALT) break;
- mb->capture_last = save_capture_last;
}
-
- RRETURN(MATCH_NOMATCH);
-
- /* Handle possessive capturing brackets with an unlimited repeat. We come
- here from BRAZERO with allow_zero set TRUE. The ovector values are
- handled similarly to the normal case above. However, the matching is
- different. The end of these brackets will always be OP_KETRPOS, which
- returns MATCH_KETRPOS without going further in the pattern. By this means
- we can handle the group by iteration rather than recursion, thereby
- reducing the amount of stack needed. If the ovector is too small for
- capturing, treat as non-capturing. */
-
- case OP_CBRAPOS:
- case OP_SCBRAPOS:
- allow_zero = FALSE;
-
- POSSESSIVE_CAPTURE:
- number = GET2(ecode, 1+LINK_SIZE);
- offset = number << 1;
- if (offset >= mb->offset_max) goto POSSESSIVE_NON_CAPTURE;
-
- matched_once = FALSE;
- code_offset = (int)(ecode - mb->start_code);
-
- save_offset1 = mb->ovector[offset];
- save_offset2 = mb->ovector[offset+1];
- save_offset3 = mb->ovector[mb->offset_end - number];
- save_capture_last = mb->capture_last;
-
- /* Each time round the loop, save the current subject position for use
- when the group matches. For MATCH_MATCH, the group has matched, so we
- restart it with a new subject starting position, remembering that we had
- at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
- usual. If we haven't matched any alternatives in any iteration, check to
- see if a previous iteration matched. If so, the group has matched;
- continue from afterwards. Otherwise it has failed; restore the previous
- capture values before returning NOMATCH. */
-
- for (;;)
+ else
+#endif
+ /* Not UTF mode */
{
- mb->ovector[mb->offset_end - number] = eptr - mb->start_subject;
- if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP;
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM63);
- if (rrc == MATCH_KETRPOS)
+ if (mb->end_subject - Feptr < 1)
{
- offset_top = mb->end_offset_top;
- ecode = mb->start_code + code_offset;
- save_capture_last = mb->capture_last;
- matched_once = TRUE;
- mstart = mb->start_match_ptr; /* In case \K changed it */
- if (eptr == mb->end_match_ptr) /* Matched an empty string */
- {
- do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
- break;
- }
- eptr = mb->end_match_ptr;
- continue;
+ SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
+ RRETURN(MATCH_NOMATCH);
}
+ if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
+ Fecode += 2;
+ }
+ break;
- /* See comment in the code for capturing groups above about handling
- THEN. */
-
- if (rrc == MATCH_THEN)
- {
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
- rrc = MATCH_NOMATCH;
- }
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->capture_last = save_capture_last;
- ecode += GET(ecode, 1);
- if (*ecode != OP_ALT) break;
- }
+ /* ===================================================================== */
+ /* Match a single character, caselessly. If we are at the end of the
+ subject, give up immediately. We get here only when the pattern character
+ has at most one other case. Characters with more than two cases are coded
+ as OP_PROP with the pseudo-property PT_CLIST. */
- if (!matched_once)
+ case OP_CHARI:
+ if (Feptr >= mb->end_subject)
{
- mb->ovector[offset] = save_offset1;
- mb->ovector[offset+1] = save_offset2;
- mb->ovector[mb->offset_end - number] = save_offset3;
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
- if (allow_zero || matched_once)
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- ecode += 1 + LINK_SIZE;
- break;
- }
- RRETURN(MATCH_NOMATCH);
+ Flength = 1;
+ Fecode++;
+ GETCHARLEN(fc, Fecode, Flength);
- /* Non-capturing possessive bracket with unlimited repeat. We come here
- from BRAZERO with allow_zero = TRUE. The code is similar to the above,
- without the capturing complication. It is written out separately for speed
- and cleanliness. */
+ /* If the pattern character's value is < 128, we know that its other case
+ (if any) is also < 128 (and therefore only one code unit long in all
+ code-unit widths), so we can use the fast lookup table. We checked above
+ that there is at least one character left in the subject. */
- case OP_BRAPOS:
- case OP_SBRAPOS:
- allow_zero = FALSE;
-
- POSSESSIVE_NON_CAPTURE:
- matched_once = FALSE;
- code_offset = (int)(ecode - mb->start_code);
- save_capture_last = mb->capture_last;
-
- for (;;)
- {
- if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP;
- RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb,
- eptrb, RM48);
- if (rrc == MATCH_KETRPOS)
+ if (fc < 128)
{
- offset_top = mb->end_offset_top;
- ecode = mb->start_code + code_offset;
- matched_once = TRUE;
- mstart = mb->start_match_ptr; /* In case \K reset it */
- if (eptr == mb->end_match_ptr) /* Matched an empty string */
- {
- do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
- break;
- }
- eptr = mb->end_match_ptr;
- continue;
+ uint32_t cc = UCHAR21(Feptr);
+ if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ Feptr++;
}
- /* See comment in the code for capturing groups above about handling
- THEN. */
+ /* Otherwise we must pick up the subject character and use Unicode
+ property support to test its other case. Note that we cannot use the
+ value of "Flength" to check for sufficient bytes left, because the other
+ case of the character may have more or fewer code units. */
- if (rrc == MATCH_THEN)
+ else
{
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
- rrc = MATCH_NOMATCH;
+ uint32_t dc;
+ GETCHARINC(dc, Feptr);
+ Fecode += Flength;
+ if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
-
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += GET(ecode, 1);
- if (*ecode != OP_ALT) break;
- mb->capture_last = save_capture_last;
}
+ else
+#endif /* SUPPORT_UNICODE */
- if (matched_once || allow_zero)
+ /* Not UTF mode; use the table for characters < 256. */
{
- ecode += 1 + LINK_SIZE;
- break;
+ if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
+ != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
+ Feptr++;
+ Fecode += 2;
}
- RRETURN(MATCH_NOMATCH);
-
- /* Control never reaches here. */
-
- /* Conditional group: compilation checked that there are no more than two
- branches. If the condition is false, skipping the first branch takes us
- past the end of the item if there is only one branch, but that's exactly
- what we want. */
-
- case OP_COND:
- case OP_SCOND:
-
- /* The variable codelink will be added to ecode when the condition is
- false, to get to the second branch. Setting it to the offset to the ALT
- or KET, then incrementing ecode achieves this effect. We now have ecode
- pointing to the condition or callout. */
+ break;
- codelink = GET(ecode, 1); /* Offset to the second branch */
- ecode += 1 + LINK_SIZE; /* From this opcode */
- /* Because of the way auto-callout works during compile, a callout item is
- inserted between OP_COND and an assertion condition. */
+ /* ===================================================================== */
+ /* Match not a single character. */
- if (*ecode == OP_CALLOUT || *ecode == OP_CALLOUT_STR)
+ case OP_NOT:
+ case OP_NOTI:
+ if (Feptr >= mb->end_subject)
{
- unsigned int callout_length = (*ecode == OP_CALLOUT)
- ? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE);
-
- if (mb->callout != NULL)
- {
- pcre2_callout_block cb;
- cb.version = 1;
- cb.capture_top = offset_top/2;
- cb.capture_last = mb->capture_last & CAPLMASK;
- cb.offset_vector = mb->ovector;
- cb.mark = mb->nomatch_mark;
- cb.subject = mb->start_subject;
- cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
- cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
- cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
- cb.pattern_position = GET(ecode, 1);
- cb.next_item_length = GET(ecode, 1 + LINK_SIZE);
-
- if (*ecode == OP_CALLOUT)
- {
- cb.callout_number = ecode[1 + 2*LINK_SIZE];
- cb.callout_string_offset = 0;
- cb.callout_string = NULL;
- cb.callout_string_length = 0;
- }
- else
- {
- cb.callout_number = 0;
- cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
- cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
- cb.callout_string_length =
- callout_length - (1 + 4*LINK_SIZE) - 2;
- }
-
- if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
- RRETURN(MATCH_NOMATCH);
- if (rrc < 0) RRETURN(rrc);
- }
-
- /* Advance ecode past the callout, so it now points to the condition. We
- must adjust codelink so that the value of ecode+codelink is unchanged. */
-
- ecode += callout_length;
- codelink -= callout_length;
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
-
- /* Test the various possible conditions */
-
- condition = FALSE;
- switch(condcode = *ecode)
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- case OP_RREF: /* Numbered group recursion test */
- if (mb->recursive != NULL) /* Not recursing => FALSE */
- {
- uint32_t recno = GET2(ecode, 1); /* Recursion group number*/
- condition = (recno == RREF_ANY || recno == mb->recursive->group_num);
- }
- break;
-
- case OP_DNRREF: /* Duplicate named group recursion test */
- if (mb->recursive != NULL)
- {
- int count = GET2(ecode, 1 + IMM2_SIZE);
- PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
- while (count-- > 0)
- {
- uint32_t recno = GET2(slot, 0);
- condition = recno == mb->recursive->group_num;
- if (condition) break;
- slot += mb->name_entry_size;
- }
- }
- break;
-
- case OP_CREF: /* Numbered group used test */
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- condition = offset < offset_top &&
- mb->ovector[offset] != PCRE2_UNSET;
- break;
-
- case OP_DNCREF: /* Duplicate named group used test */
+ uint32_t ch;
+ Fecode++;
+ GETCHARINC(ch, Fecode);
+ GETCHARINC(fc, Feptr);
+ if (ch == fc)
{
- int count = GET2(ecode, 1 + IMM2_SIZE);
- PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
- while (count-- > 0)
- {
- offset = GET2(slot, 0) << 1;
- condition = offset < offset_top &&
- mb->ovector[offset] != PCRE2_UNSET;
- if (condition) break;
- slot += mb->name_entry_size;
- }
+ RRETURN(MATCH_NOMATCH); /* Caseful match */
}
- break;
-
- case OP_FALSE:
- case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
- break;
-
- case OP_TRUE:
- condition = TRUE;
- break;
-
- /* The condition is an assertion. Call match() to evaluate it - setting
- the MATCH_CONDASSERT bit in mb->match_function_type causes it to stop at
- the end of an assertion. */
-
- default:
- mb->match_function_type |= MATCH_CONDASSERT;
- RMATCH(eptr, ecode, offset_top, mb, NULL, RM3);
- if (rrc == MATCH_MATCH)
+ else if (Fop == OP_NOTI) /* If caseless */
{
- if (mb->end_offset_top > offset_top)
- offset_top = mb->end_offset_top; /* Captures may have happened */
- condition = TRUE;
-
- /* Advance ecode past the assertion to the start of the first branch,
- but adjust it so that the general choosing code below works. If the
- assertion has a quantifier that allows zero repeats we must skip over
- the BRAZERO. This is a lunatic thing to do, but somebody did! */
-
- if (*ecode == OP_BRAZERO) ecode++;
- ecode += GET(ecode, 1);
- while (*ecode == OP_ALT) ecode += GET(ecode, 1);
- ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
- }
-
- /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
- assertion; it is therefore treated as NOMATCH. Any other return is an
- error. */
-
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- RRETURN(rrc); /* Need braces because of following else */
- }
- break;
- }
-
- /* Choose branch according to the condition */
-
- ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
-
- /* We are now at the branch that is to be obeyed. As there is only one, we
- can use tail recursion to avoid using another stack frame, except when
- there is unlimited repeat of a possibly empty group. In the latter case, a
- recursive call to match() is always required, unless the second alternative
- doesn't exist, in which case we can just plough on. Note that, for
- compatibility with Perl, the | in a conditional group is NOT treated as
- creating two alternatives. If a THEN is encountered in the branch, it
- propagates out to the enclosing alternative (unless nested in a deeper set
- of alternatives, of course). */
-
- if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
- {
- if (op != OP_SCOND)
- {
- goto TAIL_RECURSE;
+ if (ch > 127)
+ ch = UCD_OTHERCASE(ch);
+ else
+ ch = TABLE_GET(ch, mb->fcc, ch);
+ if (ch == fc) RRETURN(MATCH_NOMATCH);
}
-
- mb->match_function_type |= MATCH_CBEGROUP;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM49);
- RRETURN(rrc);
}
-
- /* Condition false & no alternative; continue after the group. */
-
else
+#endif /* SUPPORT_UNICODE */
{
+ uint32_t ch = Fecode[1];
+ fc = *Feptr++;
+ if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
+ RRETURN(MATCH_NOMATCH);
+ Fecode += 2;
}
break;
- /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
- to close any currently open capturing brackets. */
-
- case OP_CLOSE:
- number = GET2(ecode, 1); /* Must be less than 65536 */
- offset = number << 1;
- mb->capture_last = (mb->capture_last & OVFLMASK) | number;
- if (offset >= mb->offset_max) mb->capture_last |= OVFLBIT; else
- {
- mb->ovector[offset] =
- mb->ovector[mb->offset_end - number];
- mb->ovector[offset+1] = eptr - mb->start_subject;
-
- /* If this group is at or above the current highwater mark, ensure that
- any groups between the current high water mark and this group are marked
- unset and then update the high water mark. */
-
- if (offset >= offset_top)
- {
- register PCRE2_SIZE *iptr = mb->ovector + offset_top;
- register PCRE2_SIZE *iend = mb->ovector + offset;
- while (iptr < iend) *iptr++ = PCRE2_UNSET;
- offset_top = offset + 2;
- }
- }
- ecode += 1 + IMM2_SIZE;
- break;
-
+ /* ===================================================================== */
+ /* Match a single character repeatedly. */
- /* End of the pattern, either real or forced. In an assertion ACCEPT,
- update the last used pointer. */
+#define Loclength F->temp_size
+#define Lstart_eptr F->temp_sptr[0]
+#define Lcharptr F->temp_sptr[1]
+#define Lmin F->temp_32[0]
+#define Lmax F->temp_32[1]
+#define Lc F->temp_32[2]
+#define Loc F->temp_32[3]
- case OP_ASSERT_ACCEPT:
- if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
+ case OP_EXACT:
+ case OP_EXACTI:
+ Lmin = Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATCHAR;
- case OP_ACCEPT:
- case OP_END:
+ case OP_POSUPTO:
+ case OP_POSUPTOI:
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATCHAR;
- /* If we have matched an empty string, fail if not in an assertion and not
- in a recursion if either PCRE2_NOTEMPTY is set, or if PCRE2_NOTEMPTY_ATSTART
- is set and we have matched at the start of the subject. In both cases,
- backtracking will then try other alternatives, if any. */
+ case OP_UPTO:
+ case OP_UPTOI:
+ reptype = REPTYPE_MAX;
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATCHAR;
- if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
- mb->recursive == NULL &&
- ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
- ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
- mstart == mb->start_subject + mb->start_offset)))
- RRETURN(MATCH_NOMATCH);
+ case OP_MINUPTO:
+ case OP_MINUPTOI:
+ reptype = REPTYPE_MIN;
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATCHAR;
- /* Otherwise, we have a match. */
+ case OP_POSSTAR:
+ case OP_POSSTARI:
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = UINT32_MAX;
+ Fecode++;
+ goto REPEATCHAR;
- mb->end_match_ptr = eptr; /* Record where we ended */
- mb->end_offset_top = offset_top; /* and how many extracts were taken */
- mb->start_match_ptr = mstart; /* and the start (\K can modify) */
+ case OP_POSPLUS:
+ case OP_POSPLUSI:
+ reptype = REPTYPE_POS;
+ Lmin = 1;
+ Lmax = UINT32_MAX;
+ Fecode++;
+ goto REPEATCHAR;
- /* For some reason, the macros don't work properly if an expression is
- given as the argument to RRETURN when the heap is in use. */
+ case OP_POSQUERY:
+ case OP_POSQUERYI:
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = 1;
+ Fecode++;
+ goto REPEATCHAR;
- rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
- RRETURN(rrc);
+ case OP_STAR:
+ case OP_STARI:
+ case OP_MINSTAR:
+ case OP_MINSTARI:
+ case OP_PLUS:
+ case OP_PLUSI:
+ case OP_MINPLUS:
+ case OP_MINPLUSI:
+ case OP_QUERY:
+ case OP_QUERYI:
+ case OP_MINQUERY:
+ case OP_MINQUERYI:
+ fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
+ Lmin = rep_min[fc];
+ Lmax = rep_max[fc];
+ reptype = rep_typ[fc];
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. When the assertion is part
- of a condition, we want to return immediately afterwards. The caller of
- this incarnation of the match() function will have set MATCH_CONDASSERT in
- mb->match_function type, and one of these opcodes will be the first opcode
- that is processed. We use a local variable that is preserved over calls to
- match() to remember this case. */
+ /* Common code for all repeated single-character matches. We first check
+ for the minimum number of characters. If the minimum equals the maximum, we
+ are done. Otherwise, if minimizing, check the rest of the pattern for a
+ match; if there isn't one, advance up to the maximum, one character at a
+ time.
- case OP_ASSERT:
- case OP_ASSERTBACK:
- save_mark = mb->mark;
- if ((mb->match_function_type & MATCH_CONDASSERT) != 0)
- {
- condassert = TRUE;
- mb->match_function_type &= ~MATCH_CONDASSERT;
- }
- else condassert = FALSE;
+ If maximizing, advance up to the maximum number of matching characters,
+ until Feptr is past the end of the maximum run. If possessive, we are
+ then done (no backing up). Otherwise, match at this position; anything
+ other than no match is immediately returned. For nomatch, back up one
+ character, unless we are matching \R and the last thing matched was
+ \r\n, in which case, back up two code units until we reach the first
+ optional character position.
- /* Loop for each branch */
+ The various UTF/non-UTF and caseful/caseless cases are handled separately,
+ for speed. */
- do
+ REPEATCHAR:
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, NULL, RM4);
+ Flength = 1;
+ Lcharptr = Fecode;
+ GETCHARLEN(fc, Fecode, Flength);
+ Fecode += Flength;
- /* A match means that the assertion is true; break out of the loop
- that matches its alternatives. */
+ /* Handle multi-code-unit character matching, caseful and caseless. */
- if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
+ if (Flength > 1)
{
- mstart = mb->start_match_ptr; /* In case \K reset it */
- break;
- }
-
- /* If not matched, restore the previous mark setting. */
-
- mb->mark = save_mark;
-
- /* See comment in the code for capturing groups above about handling
- THEN. */
+ uint32_t othercase;
- if (rrc == MATCH_THEN)
- {
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
- rrc = MATCH_NOMATCH;
- }
+ if (Fop >= OP_STARI && /* Caseless */
+ (othercase = UCD_OTHERCASE(fc)) != fc)
+ Loclength = PRIV(ord2utf)(othercase, Foccu);
+ else Loclength = 0;
- /* Anything other than NOMATCH causes the entire assertion to fail,
- passing back the return code. This includes COMMIT, SKIP, PRUNE and an
- uncaptured THEN, which means they take their normal effect. This
- consistent approach does not always have exactly the same effect as in
- Perl. */
+ for (i = 1; i <= Lmin; i++)
+ {
+ if (Feptr <= mb->end_subject - Flength &&
+ memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
+ else if (Loclength > 0 &&
+ Feptr <= mb->end_subject - Loclength &&
+ memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
+ Feptr += Loclength;
+ else
+ {
+ CHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT); /* Continue for next alternative */
+ if (Lmin == Lmax) continue;
- /* If we have tried all the alternative branches, the assertion has
- failed. If not, we broke out after a match. */
+ if (reptype == REPTYPE_MIN)
+ {
+ for (;;)
+ {
+ RMATCH(Fecode, RM202);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr <= mb->end_subject - Flength &&
+ memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
+ else if (Loclength > 0 &&
+ Feptr <= mb->end_subject - Loclength &&
+ memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
+ Feptr += Loclength;
+ else
+ {
+ CHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ /* Control never gets here */
+ }
- if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
+ else /* Maximize */
+ {
+ Lstart_eptr = Feptr;
+ for (i = Lmin; i < Lmax; i++)
+ {
+ if (Feptr <= mb->end_subject - Flength &&
+ memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
+ Feptr += Flength;
+ else if (Loclength > 0 &&
+ Feptr <= mb->end_subject - Loclength &&
+ memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
+ Feptr += Loclength;
+ else
+ {
+ CHECK_PARTIAL();
+ break;
+ }
+ }
- /* If checking an assertion for a condition, return MATCH_MATCH. */
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
+ go too far. */
- if (condassert) RRETURN(MATCH_MATCH);
+ if (reptype != REPTYPE_POS) for(;;)
+ {
+ if (Feptr <= Lstart_eptr) break;
+ RMATCH(Fecode, RM203);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Feptr--;
+ BACKCHAR(Feptr);
+ }
+ }
+ break; /* End of repeated wide character handling */
+ }
- /* Continue from after a successful assertion, updating the offsets high
- water mark, since extracts may have been taken during the assertion. */
+ /* Length of UTF character is 1. Put it into the preserved variable and
+ fall through to the non-UTF code. */
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- ecode += 1 + LINK_SIZE;
- offset_top = mb->end_offset_top;
- continue;
+ Lc = fc;
+ }
+ else
+#endif /* SUPPORT_UNICODE */
- /* Negative assertion: all branches must fail to match for the assertion to
- succeed. */
+ /* When not in UTF mode, load a single-code-unit character. Then proceed as
+ above. */
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK_NOT:
- save_mark = mb->mark;
- if ((mb->match_function_type & MATCH_CONDASSERT) != 0)
- {
- condassert = TRUE;
- mb->match_function_type &= ~MATCH_CONDASSERT;
- }
- else condassert = FALSE;
+ Lc = *Fecode++;
- /* Loop for each alternative branch. */
+ /* Caseless comparison */
- do
+ if (Fop >= OP_STARI)
{
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, NULL, RM5);
- mb->mark = save_mark; /* Always restore the mark setting */
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ /* Lc must be < 128 in UTF-8 mode. */
+ Loc = mb->fcc[Lc];
+#else /* 16-bit & 32-bit */
+#ifdef SUPPORT_UNICODE
+ if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+ else
+#endif /* SUPPORT_UNICODE */
+ Loc = TABLE_GET(Lc, mb->fcc, Lc);
+#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
- switch(rrc)
+ for (i = 1; i <= Lmin; i++)
{
- case MATCH_MATCH: /* A successful match means */
- case MATCH_ACCEPT: /* the assertion has failed. */
- RRETURN(MATCH_NOMATCH);
-
- case MATCH_NOMATCH: /* Carry on with next branch */
- break;
-
- /* See comment in the code for capturing groups above about handling
- THEN. */
-
- case MATCH_THEN:
- next_ecode = ecode + GET(ecode,1);
- if (mb->start_match_ptr < next_ecode &&
- (*ecode == OP_ALT || *next_ecode == OP_ALT))
+ uint32_t cc; /* Faster than PCRE2_UCHAR */
+ if (Feptr >= mb->end_subject)
{
- rrc = MATCH_NOMATCH;
- break;
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
- /* Otherwise fall through. */
-
- /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
- assertion to fail to match, without considering any more alternatives.
- Failing to match means the assertion is true. This is a consistent
- approach, but does not always have the same effect as in Perl. */
-
- case MATCH_COMMIT:
- case MATCH_SKIP:
- case MATCH_SKIP_ARG:
- case MATCH_PRUNE:
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
-
- /* Anything else is an error */
-
- default:
- RRETURN(rrc);
+ cc = UCHAR21TEST(Feptr);
+ if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
+ Feptr++;
}
+ if (Lmin == Lmax) continue;
- /* Continue with next branch */
-
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- /* All branches in the assertion failed to match. */
-
- NEG_ASSERT_TRUE:
- if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
- ecode += 1 + LINK_SIZE; /* Continue with current branch */
- continue;
-
- /* Move the subject pointer back. This occurs only at the start of
- each branch of a lookbehind assertion. If we are too close to the start to
- move back, this match function fails. When working with UTF-8 we move
- back a number of characters, not bytes. */
-
- case OP_REVERSE:
- i = GET(ecode, 1);
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- while (i-- > 0)
+ if (reptype == REPTYPE_MIN)
{
- if (eptr <= mb->start_subject) RRETURN(MATCH_NOMATCH);
- eptr--;
- BACKCHAR(eptr);
+ for (;;)
+ {
+ uint32_t cc; /* Faster than PCRE2_UCHAR */
+ RMATCH(Fecode, RM25);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ cc = UCHAR21TEST(Feptr);
+ if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
+ Feptr++;
+ }
+ /* Control never gets here */
}
- }
- else
-#endif
-
- /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
-
- {
- if (i > eptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
- eptr -= i;
- }
-
- /* Save the earliest consulted character, then skip to next op code */
-
- if (eptr < mb->start_used_ptr) mb->start_used_ptr = eptr;
- ecode += 1 + LINK_SIZE;
- break;
- /* The callout item calls an external function, if one is provided, passing
- details of the match so far. This is mainly for debugging, though the
- function is able to force a failure. */
-
- case OP_CALLOUT:
- case OP_CALLOUT_STR:
- {
- unsigned int callout_length = (*ecode == OP_CALLOUT)
- ? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE);
-
- if (mb->callout != NULL)
+ else /* Maximize */
{
- pcre2_callout_block cb;
- cb.version = 1;
- cb.callout_number = ecode[LINK_SIZE + 1];
- cb.capture_top = offset_top/2;
- cb.capture_last = mb->capture_last & CAPLMASK;
- cb.offset_vector = mb->ovector;
- cb.mark = mb->nomatch_mark;
- cb.subject = mb->start_subject;
- cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
- cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
- cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
- cb.pattern_position = GET(ecode, 1);
- cb.next_item_length = GET(ecode, 1 + LINK_SIZE);
-
- if (*ecode == OP_CALLOUT)
+ Lstart_eptr = Feptr;
+ for (i = Lmin; i < Lmax; i++)
{
- cb.callout_number = ecode[1 + 2*LINK_SIZE];
- cb.callout_string_offset = 0;
- cb.callout_string = NULL;
- cb.callout_string_length = 0;
+ uint32_t cc; /* Faster than PCRE2_UCHAR */
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ cc = UCHAR21TEST(Feptr);
+ if (Lc != cc && Loc != cc) break;
+ Feptr++;
}
- else
+ if (reptype != REPTYPE_POS) for (;;)
{
- cb.callout_number = 0;
- cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
- cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
- cb.callout_string_length =
- callout_length - (1 + 4*LINK_SIZE) - 2;
+ if (Feptr == Lstart_eptr) break;
+ RMATCH(Fecode, RM26);
+ Feptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
-
- if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
- RRETURN(MATCH_NOMATCH);
- if (rrc < 0) RRETURN(rrc);
}
- ecode += callout_length;
}
- break;
-
- /* Recursion either matches the current regex, or some subexpression. The
- offset data is the offset to the starting bracket from the start of the
- whole pattern. (This is so that it works from duplicated subpatterns.)
-
- The state of the capturing groups is preserved over recursion, and
- re-instated afterwards. We don't know how many are started and not yet
- finished (offset_top records the completed total) so we just have to save
- all the potential data. There may be up to 65535 such values, which is too
- large to put on the stack, but using malloc for small numbers seems
- expensive. As a compromise, the stack is used when there are no more than
- OP_RECURSE_STACK_SAVE_MAX values to store; otherwise malloc is used.
- There are also other values that have to be saved. We use a chained
- sequence of blocks that actually live on the stack. Thanks to Robin Houston
- for the original version of this logic. It has, however, been hacked around
- a lot, so he is not to blame for the current way it works. */
+ /* Caseful comparisons (includes all multi-byte characters) */
- case OP_RECURSE:
+ else
{
- ovecsave_frame *fr;
- recursion_info *ri;
- uint32_t recno;
-
- callpat = mb->start_code + GET(ecode, 1);
- recno = (callpat == mb->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE);
-
- /* Check for repeating a pattern recursion without advancing the subject
- pointer. This should catch convoluted mutual recursions. (Some simple
- cases are caught at compile time.) */
-
- for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
- if (recno == ri->group_num && eptr == ri->subject_position)
- RRETURN(PCRE2_ERROR_RECURSELOOP);
-
- /* Add to "recursing stack" */
-
- new_recursive.group_num = recno;
- new_recursive.saved_capture_last = mb->capture_last;
- new_recursive.subject_position = eptr;
- new_recursive.prevrec = mb->recursive;
- mb->recursive = &new_recursive;
-
- /* Where to continue from afterwards */
-
- ecode += 1 + LINK_SIZE;
-
- /* When we are using the system stack for match() recursion we can call a
- function that uses the system stack for preserving the ovector while
- processing the pattern recursion, but only if the ovector is small
- enough. */
-
-#ifndef HEAP_MATCH_RECURSE
- if (mb->offset_end <= OP_RECURSE_STACK_SAVE_MAX)
- {
- rrc = op_recurse_ovecsave(eptr, callpat, mstart, offset_top, mb,
- eptrb, rdepth);
- mb->recursive = new_recursive.prevrec;
- if (rrc != MATCH_MATCH && rrc != MATCH_ACCEPT) RRETURN(rrc);
-
- /* Set where we got to in the subject, and reset the start, in case
- it was changed by \K. This *is* propagated back out of a recursion,
- for Perl compatibility. */
-
- eptr = mb->end_match_ptr;
- mstart = mb->start_match_ptr;
- break; /* End of processing OP_RECURSE */
- }
-#endif
- /* If the ovector is too big, or if we are using the heap for match()
- recursion, we have to use the heap for saving the ovector. Used ovecsave
- frames are kept on a chain and re-used. This makes a small improvement in
- execution time on Linux. */
-
- if (mb->ovecsave_chain != NULL)
- {
- new_recursive.ovec_save = mb->ovecsave_chain->saved_ovec;
- mb->ovecsave_chain = mb->ovecsave_chain->next;
- }
- else
+ for (i = 1; i <= Lmin; i++)
{
- fr = (ovecsave_frame *)(mb->memctl.malloc(sizeof(ovecsave_frame *) +
- mb->offset_end * sizeof(PCRE2_SIZE), mb->memctl.memory_data));
- if (fr == NULL) RRETURN(PCRE2_ERROR_NOMEMORY);
- new_recursive.ovec_save = fr->saved_ovec;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
}
- memcpy(new_recursive.ovec_save, mb->ovector,
- mb->offset_end * sizeof(PCRE2_SIZE));
+ if (Lmin == Lmax) continue;
- /* Do the recursion. After processing each alternative, restore the
- ovector data and the last captured value. This code has the same overall
- logic as the code in the op_recurse_ovecsave() function, but is adapted
- to use RMATCH/RRETURN and to release the heap block containing the saved
- ovector. */
-
- cbegroup = (*callpat >= OP_SBRA);
- do
+ if (reptype == REPTYPE_MIN)
{
- if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP;
- RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
- mb, eptrb, RM6);
- memcpy(mb->ovector, new_recursive.ovec_save,
- mb->offset_end * sizeof(PCRE2_SIZE));
- mb->capture_last = new_recursive.saved_capture_last;
- mb->recursive = new_recursive.prevrec;
-
- if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
+ for (;;)
{
- fr = (ovecsave_frame *)
- ((uint8_t *)new_recursive.ovec_save - sizeof(ovecsave_frame *));
- fr->next = mb->ovecsave_chain;
- mb->ovecsave_chain = fr;
-
- /* Set where we got to in the subject, and reset the start, in case
- it was changed by \K. This *is* propagated back out of a recursion,
- for Perl compatibility. */
-
- eptr = mb->end_match_ptr;
- mstart = mb->start_match_ptr;
- goto RECURSION_MATCHED; /* Exit loop; end processing */
+ RMATCH(Fecode, RM27);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
}
+ /* Control never gets here */
+ }
+ else /* Maximize */
+ {
+ Lstart_eptr = Feptr;
+ for (i = Lmin; i < Lmax; i++)
+ {
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
- /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
- recursion; they cause a NOMATCH for the entire recursion. These codes
- are defined in a range that can be tested for. */
+ if (Lc != UCHAR21TEST(Feptr)) break;
+ Feptr++;
+ }
- if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
+ if (reptype != REPTYPE_POS) for (;;)
{
- rrc = MATCH_NOMATCH;
- goto RECURSION_RETURN;
+ if (Feptr <= Lstart_eptr) break;
+ RMATCH(Fecode, RM28);
+ Feptr--;
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
}
-
- /* Any return code other than NOMATCH is an error. */
-
- if (rrc != MATCH_NOMATCH) goto RECURSION_RETURN;
- mb->recursive = &new_recursive;
- callpat += GET(callpat, 1);
}
- while (*callpat == OP_ALT);
-
- RECURSION_RETURN:
- mb->recursive = new_recursive.prevrec;
- fr = (ovecsave_frame *)
- ((uint8_t *)new_recursive.ovec_save - sizeof(ovecsave_frame *));
- fr->next = mb->ovecsave_chain;
- mb->ovecsave_chain = fr;
- RRETURN(rrc);
}
-
- RECURSION_MATCHED:
break;
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
+#undef Loclength
+#undef Lstart_eptr
+#undef Lcharptr
+#undef Lmin
+#undef Lmax
+#undef Lc
+#undef Loc
- case OP_ALT:
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- break;
- /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
- indicating that it may occur zero times. It may repeat infinitely, or not
- at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
- with fixed upper repeat limits are compiled as a number of copies, with the
- optional ones preceded by BRAZERO or BRAMINZERO. */
+ /* ===================================================================== */
+ /* Match a negated single one-byte character repeatedly. This is almost a
+ repeat of the code for a repeated single character, but I haven't found a
+ nice way of commoning these up that doesn't require a test of the
+ positive/negative option for each character match. Maybe that wouldn't add
+ very much to the time taken, but character matching *is* what this is all
+ about... */
- case OP_BRAZERO:
- next_ecode = ecode + 1;
- RMATCH(eptr, next_ecode, offset_top, mb, eptrb, RM10);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- do next_ecode += GET(next_ecode, 1); while (*next_ecode == OP_ALT);
- ecode = next_ecode + 1 + LINK_SIZE;
- break;
+#define Lstart_eptr F->temp_sptr[0]
+#define Lmin F->temp_32[0]
+#define Lmax F->temp_32[1]
+#define Lc F->temp_32[2]
+#define Loc F->temp_32[3]
- case OP_BRAMINZERO:
- next_ecode = ecode + 1;
- do next_ecode += GET(next_ecode, 1); while (*next_ecode == OP_ALT);
- RMATCH(eptr, next_ecode + 1+LINK_SIZE, offset_top, mb, eptrb, RM11);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode++;
- break;
+ case OP_NOTEXACT:
+ case OP_NOTEXACTI:
+ Lmin = Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATNOTCHAR;
- case OP_SKIPZERO:
- next_ecode = ecode+1;
- do next_ecode += GET(next_ecode,1); while (*next_ecode == OP_ALT);
- ecode = next_ecode + 1 + LINK_SIZE;
- break;
+ case OP_NOTUPTO:
+ case OP_NOTUPTOI:
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ reptype = REPTYPE_MAX;
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATNOTCHAR;
- /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
- here; just jump to the group, with allow_zero set TRUE. */
+ case OP_NOTMINUPTO:
+ case OP_NOTMINUPTOI:
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ reptype = REPTYPE_MIN;
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATNOTCHAR;
- case OP_BRAPOSZERO:
- op = *(++ecode);
- allow_zero = TRUE;
- if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
- goto POSSESSIVE_NON_CAPTURE;
+ case OP_NOTPOSSTAR:
+ case OP_NOTPOSSTARI:
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = UINT32_MAX;
+ Fecode++;
+ goto REPEATNOTCHAR;
- /* End of a group, repeated or non-repeating. */
+ case OP_NOTPOSPLUS:
+ case OP_NOTPOSPLUSI:
+ reptype = REPTYPE_POS;
+ Lmin = 1;
+ Lmax = UINT32_MAX;
+ Fecode++;
+ goto REPEATNOTCHAR;
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- case OP_KETRPOS:
- prev = ecode - GET(ecode, 1);
+ case OP_NOTPOSQUERY:
+ case OP_NOTPOSQUERYI:
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = 1;
+ Fecode++;
+ goto REPEATNOTCHAR;
- /* If this was a group that remembered the subject start, in order to break
- infinite repeats of empty string matches, retrieve the subject start from
- the chain. Otherwise, set it NULL. */
+ case OP_NOTPOSUPTO:
+ case OP_NOTPOSUPTOI:
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
+ goto REPEATNOTCHAR;
- if (*prev >= OP_SBRA || *prev == OP_ONCE)
- {
- saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
- eptrb = eptrb->epb_prev; /* Backup to previous group */
- }
- else saved_eptr = NULL;
+ case OP_NOTSTAR:
+ case OP_NOTSTARI:
+ case OP_NOTMINSTAR:
+ case OP_NOTMINSTARI:
+ case OP_NOTPLUS:
+ case OP_NOTPLUSI:
+ case OP_NOTMINPLUS:
+ case OP_NOTMINPLUSI:
+ case OP_NOTQUERY:
+ case OP_NOTQUERYI:
+ case OP_NOTMINQUERY:
+ case OP_NOTMINQUERYI:
+ fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
+ Lmin = rep_min[fc];
+ Lmax = rep_max[fc];
+ reptype = rep_typ[fc];
- /* If we are at the end of an assertion group or a non-capturing atomic
- group, stop matching and return MATCH_MATCH, but record the current high
- water mark for use by positive assertions. We also need to record the match
- start in case it was changed by \K. */
+ /* Common code for all repeated single-character non-matches. */
- if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
- *prev == OP_ONCE_NC)
- {
- mb->end_match_ptr = eptr; /* For ONCE_NC */
- mb->end_offset_top = offset_top;
- mb->start_match_ptr = mstart;
- if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
- RRETURN(MATCH_MATCH); /* Sets mb->mark */
- }
+ REPEATNOTCHAR:
+ GETCHARINCTEST(Lc, Fecode);
- /* For capturing groups we have to check the group number back at the start
- and if necessary complete handling an extraction by setting the offsets and
- bumping the high water mark. Whole-pattern recursion is coded as a recurse
- into group 0, so it won't be picked up here. Instead, we catch it when the
- OP_END is reached. Other recursion is handled here. We just have to record
- the current subject position and start match pointer and give a MATCH
- return. */
+ /* The code is duplicated for the caseless and caseful cases, for speed,
+ since matching characters is likely to be quite common. First, ensure the
+ minimum number of matches are present. If Lmin = Lmax, we are done.
+ Otherwise, if minimizing, keep trying the rest of the expression and
+ advancing one matching character if failing, up to the maximum.
+ Alternatively, if maximizing, find the maximum number of characters and
+ work backwards. */
- if (*prev == OP_CBRA || *prev == OP_SCBRA ||
- *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
+ if (Fop >= OP_NOTSTARI) /* Caseless */
{
- number = GET2(prev, 1+LINK_SIZE);
- offset = number << 1;
-
- /* Handle a recursively called group. */
-
- if (mb->recursive != NULL && mb->recursive->group_num == number)
- {
- mb->end_match_ptr = eptr;
- mb->start_match_ptr = mstart;
- if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
- RRETURN(MATCH_MATCH);
- }
+#ifdef SUPPORT_UNICODE
+ if (utf && Lc > 127)
+ Loc = UCD_OTHERCASE(Lc);
+ else
+#endif /* SUPPORT_UNICODE */
- /* Deal with capturing */
+ Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
- mb->capture_last = (mb->capture_last & OVFLMASK) | number;
- if (offset >= mb->offset_max) mb->capture_last |= OVFLBIT; else
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- /* If offset is greater than offset_top, it means that we are
- "skipping" a capturing group, and that group's offsets must be marked
- unset. In earlier versions of PCRE, all the offsets were unset at the
- start of matching, but this doesn't work because atomic groups and
- assertions can cause a value to be set that should later be unset.
- Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
- part of the atomic group, but this is not on the final matching path,
- so must be unset when 2 is set. (If there is no group 2, there is no
- problem, because offset_top will then be 2, indicating no capture.) */
-
- if (offset > offset_top)
+ uint32_t d;
+ for (i = 1; i <= Lmin; i++)
{
- register PCRE2_SIZE *iptr = mb->ovector + offset_top;
- register PCRE2_SIZE *iend = mb->ovector + offset;
- while (iptr < iend) *iptr++ = PCRE2_UNSET;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINC(d, Feptr);
+ if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
}
-
- /* Now make the extraction */
-
- mb->ovector[offset] = mb->ovector[mb->offset_end - number];
- mb->ovector[offset+1] = eptr - mb->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
- }
-
- /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
- and return the MATCH_KETRPOS. This makes it possible to do the repeats one
- at a time from the outer level, thus saving stack. This must precede the
- empty string test - in this case that test is done at the outer level. */
-
- if (*ecode == OP_KETRPOS)
- {
- mb->start_match_ptr = mstart; /* In case \K reset it */
- mb->end_match_ptr = eptr;
- mb->end_offset_top = offset_top;
- if (eptr > mb->last_used_ptr) mb->last_used_ptr = eptr;
- RRETURN(MATCH_KETRPOS);
- }
-
- /* For an ordinary non-repeating ket, just continue at this level. This
- also happens for a repeating ket if no characters were matched in the
- group. This is the forcible breaking of infinite loops as implemented in
- Perl 5.005. For a non-repeating atomic group that includes captures,
- establish a backup point by processing the rest of the pattern at a lower
- level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
- original OP_ONCE level, thereby bypassing intermediate backup points, but
- resetting any captures that happened along the way. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- if (*prev == OP_ONCE)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM12);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
- RRETURN(MATCH_ONCE);
- }
- ecode += 1 + LINK_SIZE; /* Carry on at this level */
- break;
- }
-
- /* The normal repeating kets try the rest of the pattern or restart from
- the preceding bracket, in the appropriate order. In the second case, we can
- use tail recursion to avoid using another stack frame, unless we have an
- an atomic group or an unlimited repeat of a group that can match an empty
- string. */
-
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM7);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (*prev == OP_ONCE)
- {
- RMATCH(eptr, prev, offset_top, mb, eptrb, RM8);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
- RRETURN(MATCH_ONCE);
- }
- if (*prev >= OP_SBRA) /* Could match an empty string */
- {
- RMATCH(eptr, prev, offset_top, mb, eptrb, RM50);
- RRETURN(rrc);
}
- ecode = prev;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, mb, eptrb, RM13);
- if (rrc == MATCH_ONCE && mb->once_target == prev) rrc = MATCH_NOMATCH;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (*prev == OP_ONCE)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, mb, eptrb, RM9);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- mb->once_target = prev;
- RRETURN(MATCH_ONCE);
- }
- ecode += 1 + LINK_SIZE;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
-
- /* Not multiline mode: start of subject assertion, unless notbol. */
-
- case OP_CIRC:
- if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject)
- RRETURN(MATCH_NOMATCH);
-
- /* Start of subject assertion */
-
- case OP_SOD:
- if (eptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Multiline mode: start of subject unless notbol, or after any newline
- except for one at the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
-
- case OP_CIRCM:
- if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject)
- RRETURN(MATCH_NOMATCH);
- if (eptr != mb->start_subject &&
- ((eptr == mb->end_subject &&
- (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
- !WAS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Start of match assertion */
-
- case OP_SOM:
- if (eptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Reset the start of match point */
-
- case OP_SET_SOM:
- mstart = eptr;
- ecode++;
- break;
-
- /* Multiline mode: assert before any newline, or before end of subject
- unless noteol is set. */
+ else
+#endif /* SUPPORT_UNICODE */
- case OP_DOLLM:
- if (eptr < mb->end_subject)
- {
- if (!IS_NEWLINE(eptr))
+ /* Not UTF mode */
{
- if (mb->partial != 0 &&
- eptr + 1 >= mb->end_subject &&
- NLBLOCK->nltype == NLTYPE_FIXED &&
- NLBLOCK->nllen == 2 &&
- UCHAR21TEST(eptr) == NLBLOCK->nl[0])
+ for (i = 1; i <= Lmin; i++)
{
- mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
+ Feptr++;
}
- RRETURN(MATCH_NOMATCH);
}
- }
- else
- {
- if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
- SCHECK_PARTIAL();
- }
- ecode++;
- break;
-
- /* Not multiline mode: assert before a terminating newline or before end of
- subject unless noteol is set. */
-
- case OP_DOLL:
- if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
- if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
- /* ... else fall through for endonly */
+ if (Lmin == Lmax) continue; /* Finished for exact count */
- /* End of subject assertion (\z) */
-
- case OP_EOD:
- if (eptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
- SCHECK_PARTIAL();
- ecode++;
- break;
-
- /* End of subject or ending \n assertion (\Z) */
-
- case OP_EODN:
- ASSERT_NL_OR_EOS:
- if (eptr < mb->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != mb->end_subject - mb->nllen))
- {
- if (mb->partial != 0 &&
- eptr + 1 >= mb->end_subject &&
- NLBLOCK->nltype == NLTYPE_FIXED &&
- NLBLOCK->nllen == 2 &&
- UCHAR21TEST(eptr) == NLBLOCK->nl[0])
+ if (reptype == REPTYPE_MIN)
{
- mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
- }
- RRETURN(MATCH_NOMATCH);
- }
-
- /* Either at end of string or \n before end. */
-
- SCHECK_PARTIAL();
- ecode++;
- break;
-
- /* Word boundary assertions */
-
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- {
-
- /* Find out if the previous and current characters are "word" characters.
- It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
- be "non-word" characters. Remember the earliest consulted character for
- partial matching. */
-
#ifdef SUPPORT_UNICODE
- if (utf)
- {
- /* Get status of previous character */
-
- if (eptr == mb->start_subject) prev_is_word = FALSE; else
+ if (utf)
{
- PCRE2_SPTR lastptr = eptr - 1;
- BACKCHAR(lastptr);
- if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
- GETCHAR(c, lastptr);
- if ((mb->poptions & PCRE2_UCP) != 0)
+ uint32_t d;
+ for (;;)
{
- if (c == '_') prev_is_word = TRUE; else
+ RMATCH(Fecode, RM204);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
- int cat = UCD_CATEGORY(c);
- prev_is_word = (cat == ucp_L || cat == ucp_N);
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
+ GETCHARINC(d, Feptr);
+ if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
}
- else
- prev_is_word = c < 256 && (mb->ctypes[c] & ctype_word) != 0;
- }
-
- /* Get status of next character */
-
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- cur_is_word = FALSE;
}
else
+#endif /*SUPPORT_UNICODE */
+
+ /* Not UTF mode */
{
- PCRE2_SPTR nextptr = eptr + 1;
- FORWARDCHARTEST(nextptr, mb->end_subject);
- if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
- GETCHAR(c, eptr);
- if ((mb->poptions & PCRE2_UCP) != 0)
+ for (;;)
{
- if (c == '_') cur_is_word = TRUE; else
+ RMATCH(Fecode, RM29);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
- int cat = UCD_CATEGORY(c);
- cur_is_word = (cat == ucp_L || cat == ucp_N);
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
+ if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
+ Feptr++;
}
- else
- cur_is_word = c < 256 && (mb->ctypes[c] & ctype_word) != 0;
}
+ /* Control never gets here */
}
- else
-#endif /* SUPPORT UTF */
- /* Not in UTF-8 mode, but we may still have PCRE2_UCP set, and for
- consistency with the behaviour of \w we do use it in this case. */
+ /* Maximize case */
+ else
{
- /* Get status of previous character */
+ Lstart_eptr = Feptr;
- if (eptr == mb->start_subject) prev_is_word = FALSE; else
- {
- if (eptr <= mb->start_used_ptr) mb->start_used_ptr = eptr - 1;
#ifdef SUPPORT_UNICODE
- if ((mb->poptions & PCRE2_UCP) != 0)
+ if (utf)
+ {
+ uint32_t d;
+ for (i = Lmin; i < Lmax; i++)
{
- c = eptr[-1];
- if (c == '_') prev_is_word = TRUE; else
+ int len = 1;
+ if (Feptr >= mb->end_subject)
{
- int cat = UCD_CATEGORY(c);
- prev_is_word = (cat == ucp_L || cat == ucp_N);
+ SCHECK_PARTIAL();
+ break;
}
+ GETCHARLEN(d, Feptr, len);
+ if (Lc == d || Loc == d) break;
+ Feptr += len;
}
- else
-#endif
- prev_is_word = MAX_255(eptr[-1])
- && ((mb->ctypes[eptr[-1]] & ctype_word) != 0);
- }
- /* Get status of next character */
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
+ go too far. */
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- cur_is_word = FALSE;
+ if (reptype != REPTYPE_POS) for(;;)
+ {
+ if (Feptr <= Lstart_eptr) break;
+ RMATCH(Fecode, RM205);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Feptr--;
+ BACKCHAR(Feptr);
+ }
}
else
+#endif /* SUPPORT_UNICODE */
+
+ /* Not UTF mode */
{
- if (eptr >= mb->last_used_ptr) mb->last_used_ptr = eptr + 1;
-#ifdef SUPPORT_UNICODE
- if ((mb->poptions & PCRE2_UCP) != 0)
+ for (i = Lmin; i < Lmax; i++)
{
- c = *eptr;
- if (c == '_') cur_is_word = TRUE; else
+ if (Feptr >= mb->end_subject)
{
- int cat = UCD_CATEGORY(c);
- cur_is_word = (cat == ucp_L || cat == ucp_N);
+ SCHECK_PARTIAL();
+ break;
}
+ if (Lc == *Feptr || Loc == *Feptr) break;
+ Feptr++;
+ }
+ if (reptype != REPTYPE_POS) for (;;)
+ {
+ if (Feptr == Lstart_eptr) break;
+ RMATCH(Fecode, RM30);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Feptr--;
}
- else
-#endif
- cur_is_word = MAX_255(*eptr)
- && ((mb->ctypes[*eptr] & ctype_word) != 0);
}
}
-
- /* Now see if the situation is what we want */
-
- if ((*ecode++ == OP_WORD_BOUNDARY)?
- cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- /* Match any single character type except newline; have to take care with
- CRLF newlines and partial matching. */
-
- case OP_ANY:
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- if (mb->partial != 0 &&
- eptr + 1 >= mb->end_subject &&
- NLBLOCK->nltype == NLTYPE_FIXED &&
- NLBLOCK->nllen == 2 &&
- UCHAR21TEST(eptr) == NLBLOCK->nl[0])
- {
- mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
- }
-
- /* Fall through */
-
- /* Match any single character whatsoever. */
-
- case OP_ALLANY:
- if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */
- { /* not be updated before SCHECK_PARTIAL. */
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
}
- eptr++;
-#ifdef SUPPORT_UNICODE
- if (utf) ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
-#endif
- ecode++;
- break;
-
- /* Match a single code unit, even in UTF-8 mode. This opcode really does
- match any code unit, even newline. (It really should be called ANYCODEUNIT,
- of course - the byte name is from pre-16 bit days.) */
- case OP_ANYBYTE:
- if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */
- { /* not be updated before SCHECK_PARTIAL. */
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- eptr++;
- ecode++;
- break;
-
- case OP_NOT_DIGIT:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_WIDE_CHARS
- c < 256 &&
-#endif
- (mb->ctypes[c] & ctype_digit) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_DIGIT:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_WIDE_CHARS
- c > 255 ||
-#endif
- (mb->ctypes[c] & ctype_digit) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_WHITESPACE:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_WIDE_CHARS
- c < 256 &&
-#endif
- (mb->ctypes[c] & ctype_space) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_WHITESPACE:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_WIDE_CHARS
- c > 255 ||
-#endif
- (mb->ctypes[c] & ctype_space) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_WORDCHAR:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_WIDE_CHARS
- c < 256 &&
-#endif
- (mb->ctypes[c] & ctype_word) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_WORDCHAR:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_WIDE_CHARS
- c > 255 ||
-#endif
- (mb->ctypes[c] & ctype_word) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_ANYNL:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
-
- case CHAR_CR:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- }
- else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
- break;
-
- case CHAR_LF:
- break;
-
- case CHAR_VT:
- case CHAR_FF:
- case CHAR_NEL:
-#ifndef EBCDIC
- case 0x2028:
- case 0x2029:
-#endif /* Not EBCDIC */
- if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
- break;
- }
- ecode++;
- break;
-
- case OP_NOT_HSPACE:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
- default: break;
- }
- ecode++;
- break;
-
- case OP_HSPACE:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- HSPACE_CASES: break; /* Byte and multibyte cases */
- default: RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
-
- case OP_NOT_VSPACE:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- VSPACE_CASES: RRETURN(MATCH_NOMATCH);
- default: break;
- }
- ecode++;
- break;
+ /* Caseful comparisons */
- case OP_VSPACE:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- switch(c)
+ else
{
- VSPACE_CASES: break;
- default: RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
-
#ifdef SUPPORT_UNICODE
- /* Check the next character by Unicode property. We will get here only
- if the support is in the binary; otherwise a compile-time error occurs. */
-
- case OP_PROP:
- case OP_NOTPROP:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINCTEST(c, eptr);
- {
- const uint32_t *cp;
- const ucd_record *prop = GET_UCD(c);
-
- switch(ecode[1])
+ if (utf)
{
- case PT_ANY:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_LAMP:
- if ((prop->chartype == ucp_Lu ||
- prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_GC:
- if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_PC:
- if ((ecode[2] != prop->chartype) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_SC:
- if ((ecode[2] != prop->script) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- /* These are specials */
-
- case PT_ALNUM:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- /* Perl space used to exclude VT, but from Perl 5.18 it is included,
- which means that Perl space and POSIX space are now identical. PCRE
- was changed at release 8.34. */
-
- case PT_SPACE: /* Perl space */
- case PT_PXSPACE: /* POSIX space */
- switch(c)
+ uint32_t d;
+ for (i = 1; i <= Lmin; i++)
{
- HSPACE_CASES:
- VSPACE_CASES:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
- (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
-
- case PT_WORD:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
- c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_CLIST:
- cp = PRIV(ucd_caseless_sets) + ecode[2];
- for (;;)
- {
- if (c < *cp)
- { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
- if (c == *cp++)
- { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINC(d, Feptr);
+ if (Lc == d) RRETURN(MATCH_NOMATCH);
}
- break;
-
- case PT_UCNC:
- if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
- c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
- c >= 0xe000) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- /* This should never occur */
-
- default:
- RRETURN(PCRE2_ERROR_INTERNAL);
- }
-
- ecode += 3;
- }
- break;
-
- /* Match an extended Unicode sequence. We will get here only if the support
- is in the binary; otherwise a compile-time error occurs. */
-
- case OP_EXTUNI:
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- else
- {
- int lgb, rgb;
- GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
- while (eptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
- eptr += len;
- }
- }
- CHECK_PARTIAL();
- ecode++;
- break;
-#endif /* SUPPORT_UNICODE */
-
-
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following.
-
- The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
- or to a non-duplicated named group. For a duplicated named group, OP_DNREF
- and OP_DNREFI are used. In this case we must scan the list of groups to
- which the name refers, and use the first one that is set. */
-
- case OP_DNREF:
- case OP_DNREFI:
- caseless = op == OP_DNREFI;
- {
- int count = GET2(ecode, 1+IMM2_SIZE);
- PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
- ecode += 1 + 2*IMM2_SIZE;
-
- /* Initializing 'offset' avoids a compiler warning in the REF_REPEAT
- code. */
-
- offset = 0;
- while (count-- > 0)
- {
- offset = GET2(slot, 0) << 1;
- if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) break;
- slot += mb->name_entry_size;
}
- }
- goto REF_REPEAT;
-
- case OP_REF:
- case OP_REFI:
- caseless = op == OP_REFI;
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 1 + IMM2_SIZE;
-
- /* Set up for repetition, or handle the non-repeated case */
-
- REF_REPEAT:
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 1 + IMM2_SIZE);
- if (max == 0) max = INT_MAX;
- ecode += 1 + 2 * IMM2_SIZE;
- break;
-
- default: /* No repeat follows */
+ else
+#endif
+ /* Not UTF mode */
{
- int rc = match_ref(offset, offset_top, eptr, mb, caseless, &length);
- if (rc != 0)
+ for (i = 1; i <= Lmin; i++)
{
- if (rc > 0) eptr = mb->end_subject; /* Partial match */
- CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
}
}
- eptr += length;
- continue; /* With the main loop */
- }
- /* Handle repeated back references. If a set group has length zero, just
- continue with the main loop, because it matches however many times. For an
- unset reference, if the minimum is zero, we can also just continue. We an
- also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes unset
- group be have as a zero-length group. For any other unset cases, carrying
- on will result in NOMATCH. */
+ if (Lmin == Lmax) continue;
- if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
- {
- if (mb->ovector[offset] == mb->ovector[offset + 1]) continue;
- }
- else /* Group is not set */
- {
- if (min == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
- continue;
- }
-
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
-
- for (i = 1; i <= min; i++)
- {
- PCRE2_SIZE slength;
- int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
- if (rc != 0)
+ if (reptype == REPTYPE_MIN)
{
- if (rc > 0) eptr = mb->end_subject; /* Partial match */
- CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- eptr += slength;
- }
-
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
-
- if (min == max) continue;
-
- /* If minimizing, keep trying and advancing the pointer */
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- int rc;
- PCRE2_SIZE slength;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
- if (rc != 0)
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- if (rc > 0) eptr = mb->end_subject; /* Partial match */
- CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
+ uint32_t d;
+ for (;;)
+ {
+ RMATCH(Fecode, RM206);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINC(d, Feptr);
+ if (Lc == d) RRETURN(MATCH_NOMATCH);
+ }
}
- eptr += slength;
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest string and work backwards, as long as
- the matched lengths for each iteration are the same. */
-
- else
- {
- BOOL samelengths = TRUE;
- pp = eptr;
- length = mb->ovector[offset+1] - mb->ovector[offset];
-
- for (i = min; i < max; i++)
- {
- PCRE2_SIZE slength;
- int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
-
- if (rc != 0)
+ else
+#endif
+ /* Not UTF mode */
{
- /* Can't use CHECK_PARTIAL because we don't want to update eptr in
- the soft partial matching case. */
-
- if (rc > 0 && mb->partial != 0 &&
- mb->end_subject > mb->start_used_ptr)
+ for (;;)
{
- mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ RMATCH(Fecode, RM31);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
}
- break;
}
-
- if (slength != length) samelengths = FALSE;
- eptr += slength;
+ /* Control never gets here */
}
- /* If the length matched for each repetition is the same as the length of
- the captured group, we can easily work backwards. This is the normal
- case. However, in caseless UTF-8 mode there are pairs of case-equivalent
- characters whose lengths (in terms of code units) differ. However, this
- is very rare, so we handle it by re-matching fewer and fewer times. */
+ /* Maximize case */
- if (samelengths)
+ else
{
- while (eptr >= pp)
+ Lstart_eptr = Feptr;
+
+#ifdef SUPPORT_UNICODE
+ if (utf)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr -= length;
- }
- }
+ uint32_t d;
+ for (i = Lmin; i < Lmax; i++)
+ {
+ int len = 1;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLEN(d, Feptr, len);
+ if (Lc == d) break;
+ Feptr += len;
+ }
- /* The rare case of non-matching lengths. Re-scan the repetition for each
- iteration. We know that match_ref() will succeed every time. */
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
+ go too far. */
- else
- {
- max = i;
- for (;;)
+ if (reptype != REPTYPE_POS) for(;;)
+ {
+ if (Feptr <= Lstart_eptr) break;
+ RMATCH(Fecode, RM207);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Feptr--;
+ BACKCHAR(Feptr);
+ }
+ }
+ else
+#endif
+ /* Not UTF mode */
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM68);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) break; /* Failed after minimal repetition */
- eptr = pp;
- max--;
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- PCRE2_SIZE slength;
- (void)match_ref(offset, offset_top, eptr, mb, caseless, &slength);
- eptr += slength;
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ if (Lc == *Feptr) break;
+ Feptr++;
+ }
+ if (reptype != REPTYPE_POS) for (;;)
+ {
+ if (Feptr == Lstart_eptr) break;
+ RMATCH(Fecode, RM32);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Feptr--;
}
}
}
-
- RRETURN(MATCH_NOMATCH);
}
- /* Control never gets here */
+ break;
- /* Match a bit-mapped character class, possibly repeatedly. This op code is
- used when all the characters in the class have values in the range 0-255,
- and either the matching is caseful, or the characters are in the range
- 0-127 when UTF-8 processing is enabled. The only difference between
+#undef Lstart_eptr
+#undef Lmin
+#undef Lmax
+#undef Lc
+#undef Loc
+
+
+ /* ===================================================================== */
+ /* Match a bit-mapped character class, possibly repeatedly. These opcodes
+ are used when all the characters in the class have values in the range
+ 0-255, and either the matching is caseful, or the characters are in the
+ range 0-127 when UTF processing is enabled. The only difference between
OP_CLASS and OP_NCLASS occurs when a data character outside the range is
- encountered.
+ encountered. */
- First, look past the end of the item to see if there is repeat information
- following. Then obey similar code to character type repeats - written out
- again for speed. */
+#define Lmin F->temp_32[0]
+#define Lmax F->temp_32[1]
+#define Lstart_eptr F->temp_sptr[0]
+#define Lbyte_map_address F->temp_sptr[1]
+#define Lbyte_map ((unsigned char *)Lbyte_map_address)
case OP_NCLASS:
case OP_CLASS:
{
- /* The data variable is saved across frames, so the byte map needs to
- be stored there. */
-#define BYTE_MAP ((uint8_t *)data)
- data = ecode + 1; /* Save for matching */
- ecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
+ Lbyte_map_address = Fecode + 1; /* Save for matching */
+ Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
+
+ /* Look past the end of the item to see if there is repeat information
+ following. Then obey similar code to character type repeats. */
- switch (*ecode)
+ switch (*Fecode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
@@ -2999,27 +1809,24 @@ for (;;)
case OP_CRPOSSTAR:
case OP_CRPOSPLUS:
case OP_CRPOSQUERY:
- c = *ecode++ - OP_CRSTAR;
- if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
- else possessive = TRUE;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
+ fc = *Fecode++ - OP_CRSTAR;
+ Lmin = rep_min[fc];
+ Lmax = rep_max[fc];
+ reptype = rep_typ[fc];
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- possessive = (*ecode == OP_CRPOSRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 1 + IMM2_SIZE);
- if (max == 0) max = INT_MAX;
- ecode += 1 + 2 * IMM2_SIZE;
+ Lmin = GET2(Fecode, 1);
+ Lmax = GET2(Fecode, 1 + IMM2_SIZE);
+ if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
+ reptype = rep_typ[*Fecode - OP_CRSTAR];
+ Fecode += 1 + 2 * IMM2_SIZE;
break;
default: /* No repeat follows */
- min = max = 1;
+ Lmin = Lmax = 1;
break;
}
@@ -3028,100 +1835,99 @@ for (;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- if (c > 255)
+ GETCHARINC(fc, Feptr);
+ if (fc > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
}
else
- if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
else
#endif
/* Not UTF mode */
{
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- c = *eptr++;
+ fc = *Feptr++;
#if PCRE2_CODE_UNIT_WIDTH != 8
- if (c > 255)
+ if (fc > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
}
else
#endif
- if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
- /* If max == min we can continue with the main loop without the
- need to recurse. */
+ /* If Lmax == Lmin we are done. Continue with main loop. */
- if (min == max) continue;
+ if (Lmin == Lmax) continue;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
- if (minimize)
+ if (reptype == REPTYPE_MIN)
{
#ifdef SUPPORT_UNICODE
if (utf)
{
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM16);
+ RMATCH(Fecode, RM200);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- if (c > 255)
+ GETCHARINC(fc, Feptr);
+ if (fc > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
}
else
- if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
else
#endif
/* Not UTF mode */
{
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM17);
+ RMATCH(Fecode, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- c = *eptr++;
+ fc = *Feptr++;
#if PCRE2_CODE_UNIT_WIDTH != 8
- if (c > 255)
+ if (fc > 255)
{
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
+ if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
}
else
#endif
- if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
+ if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
}
}
/* Control never gets here */
@@ -3131,91 +1937,106 @@ for (;;)
else
{
- pp = eptr;
+ Lstart_eptr = Feptr;
#ifdef SUPPORT_UNICODE
if (utf)
{
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c > 255)
+ GETCHARLEN(fc, Feptr, len);
+ if (fc > 255)
{
- if (op == OP_CLASS) break;
+ if (Fop == OP_CLASS) break;
}
else
- if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
- eptr += len;
+ if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break;
+ Feptr += len;
}
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
+
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
+ go too far. */
for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM18);
+ RMATCH(Fecode, RM201);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
+ if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
+ BACKCHAR(Feptr);
}
}
else
#endif
/* Not UTF mode */
{
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- c = *eptr;
+ fc = *Feptr;
#if PCRE2_CODE_UNIT_WIDTH != 8
- if (c > 255)
+ if (fc > 255)
{
- if (op == OP_CLASS) break;
+ if (Fop == OP_CLASS) break;
}
else
#endif
- if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
- eptr++;
+ if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break;
+ Feptr++;
}
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
- while (eptr >= pp)
+ while (Feptr >= Lstart_eptr)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM19);
+ RMATCH(Fecode, RM24);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
+ Feptr--;
}
}
RRETURN(MATCH_NOMATCH);
}
-#undef BYTE_MAP
}
/* Control never gets here */
+#undef Lbyte_map_address
+#undef Lbyte_map
+#undef Lstart_eptr
+#undef Lmin
+#undef Lmax
+
+ /* ===================================================================== */
/* Match an extended character class. In the 8-bit library, this opcode is
encountered only when UTF-8 mode is supported. In the 16-bit and
32-bit libraries, codepoints greater than 255 may be encountered even when
UTF is not supported. */
+#define Lstart_eptr F->temp_sptr[0]
+#define Lxclass_data F->temp_sptr[1]
+#define Lmin F->temp_32[0]
+#define Lmax F->temp_32[1]
+
#ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS:
{
- data = ecode + 1 + LINK_SIZE; /* Save for matching */
- ecode += GET(ecode, 1); /* Advance past the item */
+ Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
+ Fecode += GET(Fecode, 1); /* Advance past the item */
- switch (*ecode)
+ switch (*Fecode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
@@ -3226,65 +2047,61 @@ for (;;)
case OP_CRPOSSTAR:
case OP_CRPOSPLUS:
case OP_CRPOSQUERY:
- c = *ecode++ - OP_CRSTAR;
- if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
- else possessive = TRUE;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
+ fc = *Fecode++ - OP_CRSTAR;
+ Lmin = rep_min[fc];
+ Lmax = rep_max[fc];
+ reptype = rep_typ[fc];
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- possessive = (*ecode == OP_CRPOSRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 1 + IMM2_SIZE);
- if (max == 0) max = INT_MAX;
- ecode += 1 + 2 * IMM2_SIZE;
+ Lmin = GET2(Fecode, 1);
+ Lmax = GET2(Fecode, 1 + IMM2_SIZE);
+ if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
+ reptype = rep_typ[*Fecode - OP_CRSTAR];
+ Fecode += 1 + 2 * IMM2_SIZE;
break;
default: /* No repeat follows */
- min = max = 1;
+ Lmin = Lmax = 1;
break;
}
/* First, ensure the minimum number of matches are present. */
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(fc, Feptr);
+ if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
}
- /* If max == min we can continue with the main loop without the
- need to recurse. */
+ /* If Lmax == Lmin we can just continue with the main loop. */
- if (min == max) continue;
+ if (Lmin == Lmax) continue;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
- if (minimize)
+ if (reptype == REPTYPE_MIN)
{
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM20);
+ RMATCH(Fecode, RM100);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(fc, Feptr);
+ if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -3293,33 +2110,37 @@ for (;;)
else
{
- pp = eptr;
- for (i = min; i < max; i++)
+ Lstart_eptr = Feptr;
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
#ifdef SUPPORT_UNICODE
- GETCHARLENTEST(c, eptr, len);
+ GETCHARLENTEST(fc, Feptr, len);
#else
- c = *eptr;
+ fc = *Feptr;
#endif
- if (!PRIV(xclass)(c, data, utf)) break;
- eptr += len;
+ if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
+ Feptr += len;
}
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
+
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
+ go too far. */
for(;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM21);
+ RMATCH(Fecode, RM101);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
#ifdef SUPPORT_UNICODE
- if (utf) BACKCHAR(eptr);
+ if (utf) BACKCHAR(Feptr);
#endif
}
RRETURN(MATCH_NOMATCH);
@@ -3327,887 +2148,370 @@ for (;;)
/* Control never gets here */
}
-#endif /* End of XCLASS */
+#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
- /* Match a single character, casefully */
+#undef Lstart_eptr
+#undef Lxclass_data
+#undef Lmin
+#undef Lmax
- case OP_CHAR:
-#ifdef SUPPORT_UNICODE
- if (utf)
+
+ /* ===================================================================== */
+ /* Match various character types when PCRE2_UCP is not set. These opcodes
+ are not generated when PCRE2_UCP is set - instead appropriate property
+ tests are compiled. */
+
+ case OP_NOT_DIGIT:
+ if (Feptr >= mb->end_subject)
{
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
- if (length > (PCRE2_SIZE)(mb->end_subject - eptr))
- {
- CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
- }
- for (; length > 0; length--)
- {
- if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
- }
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
- else
-#endif
- /* Not UTF mode */
+ GETCHARINCTEST(fc, Feptr);
+ if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
+
+ case OP_DIGIT:
+ if (Feptr >= mb->end_subject)
{
- if (mb->end_subject - eptr < 1)
- {
- SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
- }
- if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
- ecode += 2;
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
+ GETCHARINCTEST(fc, Feptr);
+ if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
break;
- /* Match a single character, caselessly. If we are at the end of the
- subject, give up immediately. */
-
- case OP_CHARI:
- if (eptr >= mb->end_subject)
+ case OP_NOT_WHITESPACE:
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
+ GETCHARINCTEST(fc, Feptr);
+ if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
-#ifdef SUPPORT_UNICODE
- if (utf)
+ case OP_WHITESPACE:
+ if (Feptr >= mb->end_subject)
{
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
-
- /* If the pattern character's value is < 128, we have only one byte, and
- we know that its other case must also be one byte long, so we can use the
- fast lookup table. We know that there is at least one byte left in the
- subject. */
-
- if (fc < 128)
- {
- uint32_t cc = UCHAR21(eptr);
- if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
- ecode++;
- eptr++;
- }
-
- /* Otherwise we must pick up the subject character. Note that we cannot
- use the value of "length" to check for sufficient bytes left, because the
- other case of the character may have more or fewer bytes. */
-
- else
- {
- uint32_t dc;
- GETCHARINC(dc, eptr);
- ecode += length;
-
- /* If we have Unicode property support, we can use it to test the other
- case of the character, if there is one. */
-
- if (fc != dc)
- {
-#ifdef SUPPORT_UNICODE
- if (dc != UCD_OTHERCASE(fc))
-#endif
- RRETURN(MATCH_NOMATCH);
- }
- }
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
- else
-#endif /* SUPPORT_UNICODE */
+ GETCHARINCTEST(fc, Feptr);
+ if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
- /* Not UTF mode */
+ case OP_NOT_WORDCHAR:
+ if (Feptr >= mb->end_subject)
{
- if (TABLE_GET(ecode[1], mb->lcc, ecode[1])
- != TABLE_GET(*eptr, mb->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- ecode += 2;
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
+ GETCHARINCTEST(fc, Feptr);
+ if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
break;
- /* Match a single character repeatedly. */
-
- case OP_EXACT:
- case OP_EXACTI:
- min = max = GET2(ecode, 1);
- ecode += 1 + IMM2_SIZE;
- goto REPEATCHAR;
-
- case OP_POSUPTO:
- case OP_POSUPTOI:
- possessive = TRUE;
- /* Fall through */
-
- case OP_UPTO:
- case OP_UPTOI:
- case OP_MINUPTO:
- case OP_MINUPTOI:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
- ecode += 1 + IMM2_SIZE;
- goto REPEATCHAR;
-
- case OP_POSSTAR:
- case OP_POSSTARI:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
-
- case OP_POSPLUS:
- case OP_POSPLUSI:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
-
- case OP_POSQUERY:
- case OP_POSQUERYI:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATCHAR;
-
- case OP_STAR:
- case OP_STARI:
- case OP_MINSTAR:
- case OP_MINSTARI:
- case OP_PLUS:
- case OP_PLUSI:
- case OP_MINPLUS:
- case OP_MINPLUSI:
- case OP_QUERY:
- case OP_QUERYI:
- case OP_MINQUERY:
- case OP_MINQUERYI:
- c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single-character matches. We first check
- for the minimum number of characters. If the minimum equals the maximum, we
- are done. Otherwise, if minimizing, check the rest of the pattern for a
- match; if there isn't one, advance up to the maximum, one character at a
- time.
-
- If maximizing, advance up to the maximum number of matching characters,
- until eptr is past the end of the maximum run. If possessive, we are
- then done (no backing up). Otherwise, match at this position; anything
- other than no match is immediately returned. For nomatch, back up one
- character, unless we are matching \R and the last thing matched was
- \r\n, in which case, back up two bytes. When we reach the first optional
- character position, we can save stack by doing a tail recurse.
-
- The various UTF/non-UTF and caseful/caseless cases are handled separately,
- for speed. */
-
- REPEATCHAR:
-#ifdef SUPPORT_UNICODE
- if (utf)
+ case OP_WORDCHAR:
+ if (Feptr >= mb->end_subject)
{
- length = 1;
- charptr = ecode;
- GETCHARLEN(fc, ecode, length);
- ecode += length;
-
- /* Handle multibyte character matching specially here. There is
- support for caseless matching if UCP support is present. */
-
- if (length > 1)
- {
- uint32_t othercase;
- if (op >= OP_STARI && /* Caseless */
- (othercase = UCD_OTHERCASE(fc)) != fc)
- oclength = PRIV(ord2utf)(othercase, occhars);
- else oclength = 0;
-
- for (i = 1; i <= min; i++)
- {
- if (eptr <= mb->end_subject - length &&
- memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length;
- else if (oclength > 0 &&
- eptr <= mb->end_subject - oclength &&
- memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength;
- else
- {
- CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM22);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr <= mb->end_subject - length &&
- memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length;
- else if (oclength > 0 &&
- eptr <= mb->end_subject - oclength &&
- memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength;
- else
- {
- CHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr <= mb->end_subject - length &&
- memcmp(eptr, charptr, CU2BYTES(length)) == 0) eptr += length;
- else if (oclength > 0 &&
- eptr <= mb->end_subject - oclength &&
- memcmp(eptr, occhars, CU2BYTES(oclength)) == 0) eptr += oclength;
- else
- {
- CHECK_PARTIAL();
- break;
- }
- }
-
- if (possessive) continue; /* No backtracking */
-
- /* After \C in UTF mode, pp might be in the middle of a Unicode
- character. Use <= pp to ensure backtracking doesn't go too far. */
-
- for(;;)
- {
- if (eptr <= pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM23);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- BACKCHAR(eptr);
- }
- }
- /* Control never gets here */
- }
-
- /* If the length of a UTF-8 character is 1, we fall through here, and
- obey the code as for non-UTF-8 characters below, though in this case the
- value of fc will always be < 128. */
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
- else
-#endif /* SUPPORT_UNICODE */
-
- /* When not in UTF-8 mode, load a single-byte character. */
- fc = *ecode++;
-
- /* The value of fc at this point is always one character, though we may
- or may not be in UTF mode. The code is duplicated for the caseless and
- caseful cases, for speed, since matching characters is likely to be quite
- common. First, ensure the minimum number of matches are present. If min =
- max, continue at the same level without recursing. Otherwise, if
- minimizing, keep trying the rest of the expression and advancing one
- matching character if failing, up to the maximum. Alternatively, if
- maximizing, find the maximum number of characters and work backwards. */
+ GETCHARINCTEST(fc, Feptr);
+ if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
- if (op >= OP_STARI) /* Caseless */
+ case OP_ANYNL:
+ if (Feptr >= mb->end_subject)
{
-#if PCRE2_CODE_UNIT_WIDTH == 8
- /* fc must be < 128 if UTF is enabled. */
- foc = mb->fcc[fc];
-#else
-#ifdef SUPPORT_UNICODE
- if (utf && fc > 127)
- foc = UCD_OTHERCASE(fc);
- else
-#endif /* SUPPORT_UNICODE */
- foc = TABLE_GET(fc, mb->fcc, fc);
-#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
-
- for (i = 1; i <= min; i++)
- {
- uint32_t cc; /* Faster than PCRE2_UCHAR */
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- cc = UCHAR21TEST(eptr);
- if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- uint32_t cc; /* Faster than PCRE2_UCHAR */
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM24);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- cc = UCHAR21TEST(eptr);
- if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- uint32_t cc; /* Faster than PCRE2_UCHAR */
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- cc = UCHAR21TEST(eptr);
- if (fc != cc && foc != cc) break;
- eptr++;
- }
- if (possessive) continue; /* No backtracking */
- for (;;)
- {
- if (eptr == pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM25);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- /* Control never gets here */
- }
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
-
- /* Caseful comparisons (includes all multi-byte characters) */
-
- else
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
{
- for (i = 1; i <= min; i++)
+ default: RRETURN(MATCH_NOMATCH);
+
+ case CHAR_CR:
+ if (Feptr >= mb->end_subject)
{
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
}
+ else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
+ break;
- if (min == max) continue;
+ case CHAR_LF:
+ break;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM26);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- if (fc != UCHAR21TEST(eptr)) break;
- eptr++;
- }
- if (possessive) continue; /* No backtracking */
- for (;;)
- {
- if (eptr == pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM27);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- /* Control never gets here */
- }
+ case CHAR_VT:
+ case CHAR_FF:
+ case CHAR_NEL:
+#ifndef EBCDIC
+ case 0x2028:
+ case 0x2029:
+#endif /* Not EBCDIC */
+ if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
+ break;
}
- /* Control never gets here */
+ Fecode++;
+ break;
- /* Match a negated single one-byte character. The character we are
- checking can be multibyte. */
+ case OP_NOT_HSPACE:
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
+ {
+ HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
+ default: break;
+ }
+ Fecode++;
+ break;
- case OP_NOT:
- case OP_NOTI:
- if (eptr >= mb->end_subject)
+ case OP_HSPACE:
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
-#ifdef SUPPORT_UNICODE
- if (utf)
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
{
- register uint32_t ch, och;
+ HSPACE_CASES: break; /* Byte and multibyte cases */
+ default: RRETURN(MATCH_NOMATCH);
+ }
+ Fecode++;
+ break;
- ecode++;
- GETCHARINC(ch, ecode);
- GETCHARINC(c, eptr);
+ case OP_NOT_VSPACE:
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
+ {
+ VSPACE_CASES: RRETURN(MATCH_NOMATCH);
+ default: break;
+ }
+ Fecode++;
+ break;
- if (op == OP_NOT)
- {
- if (ch == c) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if (ch > 127)
- och = UCD_OTHERCASE(ch);
- else
- och = TABLE_GET(ch, mb->fcc, ch);
- if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
- }
+ case OP_VSPACE:
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
- else
-#endif /* SUPPORT_UNICODE */
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
{
- register uint32_t ch = ecode[1];
- c = *eptr++;
- if (ch == c || (op == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == c))
- RRETURN(MATCH_NOMATCH);
- ecode += 2;
+ VSPACE_CASES: break;
+ default: RRETURN(MATCH_NOMATCH);
}
+ Fecode++;
break;
- /* Match a negated single one-byte character repeatedly. This is almost a
- repeat of the code for a repeated single character, but I haven't found a
- nice way of commoning these up that doesn't require a test of the
- positive/negative option for each character match. Maybe that wouldn't add
- very much to the time taken, but character matching *is* what this is all
- about... */
- case OP_NOTEXACT:
- case OP_NOTEXACTI:
- min = max = GET2(ecode, 1);
- ecode += 1 + IMM2_SIZE;
- goto REPEATNOTCHAR;
+#ifdef SUPPORT_UNICODE
- case OP_NOTUPTO:
- case OP_NOTUPTOI:
- case OP_NOTMINUPTO:
- case OP_NOTMINUPTOI:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
- ecode += 1 + IMM2_SIZE;
- goto REPEATNOTCHAR;
+ /* ===================================================================== */
+ /* Check the next character by Unicode property. We will get here only
+ if the support is in the binary; otherwise a compile-time error occurs. */
- case OP_NOTPOSSTAR:
- case OP_NOTPOSSTARI:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
+ case OP_PROP:
+ case OP_NOTPROP:
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(fc, Feptr);
+ {
+ const uint32_t *cp;
+ const ucd_record *prop = GET_UCD(fc);
- case OP_NOTPOSPLUS:
- case OP_NOTPOSPLUSI:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
+ switch(Fecode[1])
+ {
+ case PT_ANY:
+ if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
- case OP_NOTPOSQUERY:
- case OP_NOTPOSQUERYI:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATNOTCHAR;
+ case PT_LAMP:
+ if ((prop->chartype == ucp_Lu ||
+ prop->chartype == ucp_Ll ||
+ prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- case OP_NOTPOSUPTO:
- case OP_NOTPOSUPTOI:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 1 + IMM2_SIZE;
- goto REPEATNOTCHAR;
+ case PT_GC:
+ if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- case OP_NOTSTAR:
- case OP_NOTSTARI:
- case OP_NOTMINSTAR:
- case OP_NOTMINSTARI:
- case OP_NOTPLUS:
- case OP_NOTPLUSI:
- case OP_NOTMINPLUS:
- case OP_NOTMINPLUSI:
- case OP_NOTQUERY:
- case OP_NOTQUERYI:
- case OP_NOTMINQUERY:
- case OP_NOTMINQUERYI:
- c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
+ case PT_PC:
+ if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- /* Common code for all repeated single-byte matches. */
+ case PT_SC:
+ if ((Fecode[2] != prop->script) == (Fop == OP_PROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- REPEATNOTCHAR:
- GETCHARINCTEST(fc, ecode);
+ /* These are specials */
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
+ case PT_ALNUM:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
+ PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- if (op >= OP_NOTSTARI) /* Caseless */
- {
-#ifdef SUPPORT_UNICODE
- if (utf && fc > 127)
- foc = UCD_OTHERCASE(fc);
- else
-#endif /* SUPPORT_UNICODE */
- foc = TABLE_GET(fc, mb->fcc, fc);
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- register uint32_t d;
- for (i = 1; i <= min; i++)
- {
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINC(d, eptr);
- if (fc == d || (uint32_t)foc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif /* SUPPORT_UNICODE */
- /* Not UTF mode */
- {
- for (i = 1; i <= min; i++)
+ case PT_SPACE: /* Perl space */
+ case PT_PXSPACE: /* POSIX space */
+ switch(fc)
{
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
- eptr++;
+ HSPACE_CASES:
+ VSPACE_CASES:
+ if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
+
+ default:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
+ (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
+ break;
}
- }
+ break;
- if (min == max) continue;
+ case PT_WORD:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
+ PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
+ fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- if (minimize)
- {
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- register uint32_t d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM28);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINC(d, eptr);
- if (fc == d || (uint32_t)foc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif /*SUPPORT_UNICODE */
- /* Not UTF mode */
+ case PT_CLIST:
+ cp = PRIV(ucd_caseless_sets) + Fecode[2];
+ for (;;)
{
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM29);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
+ if (fc < *cp)
+ { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
+ if (fc == *cp++)
+ { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
}
- /* Control never gets here */
- }
-
- /* Maximize case */
-
- else
- {
- pp = eptr;
+ break;
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- register uint32_t d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- GETCHARLEN(d, eptr, len);
- if (fc == d || (uint32_t)foc == d) break;
- eptr += len;
- }
- if (possessive) continue; /* No backtracking */
+ case PT_UCNC:
+ if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
+ fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
+ fc >= 0xe000) == (Fop == OP_NOTPROP))
+ RRETURN(MATCH_NOMATCH);
+ break;
- /* After \C in UTF mode, pp might be in the middle of a Unicode
- character. Use <= pp to ensure backtracking doesn't go too far. */
+ /* This should never occur */
- for(;;)
- {
- if (eptr <= pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM30);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- BACKCHAR(eptr);
- }
- }
- else
-#endif /* SUPPORT_UNICODE */
- /* Not UTF mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- if (fc == *eptr || foc == *eptr) break;
- eptr++;
- }
- if (possessive) continue; /* No backtracking */
- for (;;)
- {
- if (eptr == pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM31);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
- /* Control never gets here */
+ default:
+ return PCRE2_ERROR_INTERNAL;
}
+
+ Fecode += 3;
}
+ break;
- /* Caseful comparisons */
+ /* ===================================================================== */
+ /* Match an extended Unicode sequence. We will get here only if the support
+ is in the binary; otherwise a compile-time error occurs. */
+
+ case OP_EXTUNI:
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
else
{
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- register uint32_t d;
- for (i = 1; i <= min; i++)
- {
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF mode */
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
- }
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- register uint32_t d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM32);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM33);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* Maximize case */
-
- else
- {
- pp = eptr;
+ GETCHARINCTEST(fc, Feptr);
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
+ NULL);
+ }
+ CHECK_PARTIAL();
+ Fecode++;
+ break;
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- register uint32_t d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- GETCHARLEN(d, eptr, len);
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue; /* No backtracking */
+#endif /* SUPPORT_UNICODE */
- /* After \C in UTF mode, pp might be in the middle of a Unicode
- character. Use <= pp to ensure backtracking doesn't go too far. */
- for(;;)
- {
- if (eptr <= pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM34);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= mb->end_subject)
- {
- SCHECK_PARTIAL();
- break;
- }
- if (fc == *eptr) break;
- eptr++;
- }
- if (possessive) continue; /* No backtracking */
- for (;;)
- {
- if (eptr == pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM35);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
- /* Control never gets here */
- }
- }
- /* Control never gets here */
+ /* ===================================================================== */
+ /* Match a single character type repeatedly. Note that the property type
+ does not need to be in a stack frame as it is not used within an RMATCH()
+ loop. */
- /* Match a single character type repeatedly; several different opcodes
- share code. This is very similar to the code for single characters, but we
- repeat it in the interests of efficiency. */
+#define Lstart_eptr F->temp_sptr[0]
+#define Lmin F->temp_32[0]
+#define Lmax F->temp_32[1]
+#define Lctype F->temp_32[2]
+#define Lpropvalue F->temp_32[3]
case OP_TYPEEXACT:
- min = max = GET2(ecode, 1);
- minimize = TRUE;
- ecode += 1 + IMM2_SIZE;
+ Lmin = Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
goto REPEATTYPE;
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_TYPEMINUPTO;
- ecode += 1 + IMM2_SIZE;
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
+ Fecode += 1 + IMM2_SIZE;
goto REPEATTYPE;
case OP_TYPEPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = UINT32_MAX;
+ Fecode++;
goto REPEATTYPE;
case OP_TYPEPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
+ reptype = REPTYPE_POS;
+ Lmin = 1;
+ Lmax = UINT32_MAX;
+ Fecode++;
goto REPEATTYPE;
case OP_TYPEPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = 1;
+ Fecode++;
goto REPEATTYPE;
case OP_TYPEPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 1 + IMM2_SIZE;
+ reptype = REPTYPE_POS;
+ Lmin = 0;
+ Lmax = GET2(Fecode, 1);
+ Fecode += 1 + IMM2_SIZE;
goto REPEATTYPE;
case OP_TYPESTAR:
@@ -4216,127 +2520,122 @@ for (;;)
case OP_TYPEMINPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
- c = *ecode++ - OP_TYPESTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
+ fc = *Fecode++ - OP_TYPESTAR;
+ Lmin = rep_min[fc];
+ Lmax = rep_max[fc];
+ reptype = rep_typ[fc];
- /* Common code for all repeated single character type matches. Note that
- in UTF-8 mode, '.' matches a character of any length, but for the other
- character types, the valid characters are all one-byte long. */
+ /* Common code for all repeated character type matches. */
REPEATTYPE:
- ctype = *ecode++; /* Code for the character type */
+ Lctype = *Fecode++; /* Code for the character type */
#ifdef SUPPORT_UNICODE
- if (ctype == OP_PROP || ctype == OP_NOTPROP)
+ if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
{
- prop_fail_result = ctype == OP_NOTPROP;
- prop_type = *ecode++;
- prop_value = *ecode++;
+ proptype = *Fecode++;
+ Lpropvalue = *Fecode++;
}
- else prop_type = -1;
+ else proptype = -1;
#endif
/* First, ensure the minimum number of matches are present. Use inline
code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
- is tidier. Also separate the UCP code, which can be the same for both UTF-8
- and single-bytes. */
+ (i.e. keep it out of the loop). The code for UTF mode is separated out for
+ tidiness, except for Unicode property tests. */
- if (min > 0)
+ if (Lmin > 0)
{
#ifdef SUPPORT_UNICODE
- if (prop_type >= 0)
+ if (proptype >= 0) /* Property tests in all modes */
{
- switch(prop_type)
+ switch(proptype)
{
case PT_ANY:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- for (i = 1; i <= min; i++)
+ if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
+ GETCHARINCTEST(fc, Feptr);
}
break;
case PT_LAMP:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
int chartype;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- chartype = UCD_CHARTYPE(c);
+ GETCHARINCTEST(fc, Feptr);
+ chartype = UCD_CHARTYPE(fc);
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
- chartype == ucp_Lt) == prop_fail_result)
+ chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
case PT_GC:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
case PT_PC:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
case PT_SC:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
case PT_ALNUM:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
int category;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- category = UCD_CATEGORY(c);
- if ((category == ucp_L || category == ucp_N) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ category = UCD_CATEGORY(fc);
+ if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
@@ -4347,23 +2646,23 @@ for (;;)
case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- switch(c)
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
{
HSPACE_CASES:
VSPACE_CASES:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
break;
default:
- if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
break;
}
@@ -4371,55 +2670,61 @@ for (;;)
break;
case PT_WORD:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
int category;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- category = UCD_CATEGORY(c);
- if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
- == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ category = UCD_CATEGORY(fc);
+ if ((category == ucp_L || category == ucp_N ||
+ fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
case PT_CLIST:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
const uint32_t *cp;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- cp = PRIV(ucd_caseless_sets) + prop_value;
+ GETCHARINCTEST(fc, Feptr);
+ cp = PRIV(ucd_caseless_sets) + Lpropvalue;
for (;;)
{
- if (c < *cp)
- { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
- if (c == *cp++)
- { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
+ if (fc < *cp)
+ {
+ if (Lctype == OP_NOTPROP) break;
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (fc == *cp++)
+ {
+ if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
+ }
}
}
break;
case PT_UCNC:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
- c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
- c >= 0xe000) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
+ fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
+ fc >= 0xe000) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
break;
@@ -4427,105 +2732,95 @@ for (;;)
/* This should not occur */
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
}
/* Match extended Unicode sequences. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
- else if (ctype == OP_EXTUNI)
+ else if (Lctype == OP_EXTUNI)
{
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
else
{
- int lgb, rgb;
- GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
- while (eptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
- eptr += len;
- }
+ GETCHARINCTEST(fc, Feptr);
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
+ mb->end_subject, utf, NULL);
}
CHECK_PARTIAL();
}
}
-
else
#endif /* SUPPORT_UNICODE */
-/* Handle all other cases when the coding is UTF-8 */
+/* Handle all other cases in UTF mode */
#ifdef SUPPORT_UNICODE
- if (utf) switch(ctype)
+ if (utf) switch(Lctype)
{
case OP_ANY:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
if (mb->partial != 0 &&
- eptr + 1 >= mb->end_subject &&
+ Feptr + 1 >= mb->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- UCHAR21(eptr) == NLBLOCK->nl[0])
+ UCHAR21(Feptr) == NLBLOCK->nl[0])
{
mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
- eptr++;
- ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
+ Feptr++;
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
case OP_ALLANY:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- eptr++;
- ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
+ Feptr++;
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
case OP_ANYBYTE:
- if (eptr > mb->end_subject - min) RRETURN(MATCH_NOMATCH);
- eptr += min;
+ if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
+ Feptr += Lmin;
break;
case OP_ANYNL:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- switch(c)
+ GETCHARINC(fc, Feptr);
+ switch(fc)
{
default: RRETURN(MATCH_NOMATCH);
case CHAR_CR:
- if (eptr < mb->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
+ if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
break;
case CHAR_LF:
@@ -4545,49 +2840,49 @@ for (;;)
break;
case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- switch(c)
+ GETCHARINC(fc, Feptr);
+ switch(fc)
{
- HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
+ HSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
}
}
break;
case OP_HSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- switch(c)
+ GETCHARINC(fc, Feptr);
+ switch(fc)
{
- HSPACE_CASES: break; /* Byte and multibyte cases */
+ HSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
}
}
break;
case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- switch(c)
+ GETCHARINC(fc, Feptr);
+ switch(fc)
{
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
@@ -4596,15 +2891,15 @@ for (;;)
break;
case OP_VSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- switch(c)
+ GETCHARINC(fc, Feptr);
+ switch(fc)
{
VSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
@@ -4613,170 +2908,174 @@ for (;;)
break;
case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINC(c, eptr);
- if (c < 128 && (mb->ctypes[c] & ctype_digit) != 0)
+ GETCHARINC(fc, Feptr);
+ if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
}
break;
case OP_DIGIT:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
uint32_t cc;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = UCHAR21(eptr);
+ cc = UCHAR21(Feptr);
if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
- /* No need to skip more bytes - we know it's a 1-byte character */
+ Feptr++;
+ /* No need to skip more code units - we know it has only one. */
}
break;
case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
uint32_t cc;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = UCHAR21(eptr);
+ cc = UCHAR21(Feptr);
if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
- ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
+ Feptr++;
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
uint32_t cc;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = UCHAR21(eptr);
+ cc = UCHAR21(Feptr);
if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
- /* No need to skip more bytes - we know it's a 1-byte character */
+ Feptr++;
+ /* No need to skip more code units - we know it has only one. */
}
break;
case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
uint32_t cc;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = UCHAR21(eptr);
+ cc = UCHAR21(Feptr);
if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
- ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
+ Feptr++;
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
uint32_t cc;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = UCHAR21(eptr);
+ cc = UCHAR21(Feptr);
if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
- /* No need to skip more bytes - we know it's a 1-byte character */
+ Feptr++;
+ /* No need to skip more code units - we know it has only one. */
}
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
- } /* End switch(ctype) */
+ return PCRE2_ERROR_INTERNAL;
+ } /* End switch(Lctype) */
else
#endif /* SUPPORT_UNICODE */
- /* Code for the non-UTF-8 case for minimum matching of operators other
+ /* Code for the non-UTF case for minimum matching of operators other
than OP_PROP and OP_NOTPROP. */
- switch(ctype)
+ switch(Lctype)
{
case OP_ANY:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
if (mb->partial != 0 &&
- eptr + 1 >= mb->end_subject &&
+ Feptr + 1 >= mb->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ *Feptr == NLBLOCK->nl[0])
{
mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
- eptr++;
+ Feptr++;
}
break;
case OP_ALLANY:
- if (eptr > mb->end_subject - min)
+ if (Feptr > mb->end_subject - Lmin)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- eptr += min;
- break;
-
- case OP_ANYBYTE:
- if (eptr > mb->end_subject - min)
- {
- SCHECK_PARTIAL();
- RRETURN(MATCH_NOMATCH);
- }
- eptr += min;
+ Feptr += Lmin;
break;
+ /* This OP_ANYBYTE case will never be reached because \C gets turned
+ into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
+ reports don't complain about its never being used. */
+
+/* case OP_ANYBYTE:
+* if (Feptr > mb->end_subject - Lmin)
+* {
+* SCHECK_PARTIAL();
+* RRETURN(MATCH_NOMATCH);
+* }
+* Feptr += Lmin;
+* break;
+*/
case OP_ANYNL:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- switch(*eptr++)
+ switch(*Feptr++)
{
default: RRETURN(MATCH_NOMATCH);
case CHAR_CR:
- if (eptr < mb->end_subject && *eptr == CHAR_LF) eptr++;
+ if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
break;
case CHAR_LF:
@@ -4796,14 +3095,14 @@ for (;;)
break;
case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- switch(*eptr++)
+ switch(*Feptr++)
{
default: break;
HSPACE_BYTE_CASES:
@@ -4816,14 +3115,14 @@ for (;;)
break;
case OP_HSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- switch(*eptr++)
+ switch(*Feptr++)
{
default: RRETURN(MATCH_NOMATCH);
HSPACE_BYTE_CASES:
@@ -4836,14 +3135,14 @@ for (;;)
break;
case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- switch(*eptr++)
+ switch(*Feptr++)
{
VSPACE_BYTE_CASES:
#if PCRE2_CODE_UNIT_WIDTH != 8
@@ -4856,14 +3155,14 @@ for (;;)
break;
case OP_VSPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- switch(*eptr++)
+ switch(*Feptr++)
{
default: RRETURN(MATCH_NOMATCH);
VSPACE_BYTE_CASES:
@@ -4876,212 +3175,212 @@ for (;;)
break;
case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_digit) != 0)
+ if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
+ Feptr++;
}
break;
case OP_DIGIT:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_digit) == 0)
+ if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
+ Feptr++;
}
break;
case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_space) != 0)
+ if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
+ Feptr++;
}
break;
case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_space) == 0)
+ if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
+ Feptr++;
}
break;
case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_word) != 0)
+ if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
+ Feptr++;
}
break;
case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= Lmin; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_word) == 0)
+ if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
- eptr++;
+ Feptr++;
}
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
}
- /* If min = max, continue at the same level without recursing */
+ /* If Lmin = Lmax we are done. Continue with the main loop. */
- if (min == max) continue;
+ if (Lmin == Lmax) continue;
/* If minimizing, we have to test the rest of the pattern before each
- subsequent match. Again, separate the UTF-8 case for speed, and also
- separate the UCP cases. */
+ subsequent match. */
- if (minimize)
+ if (reptype == REPTYPE_MIN)
{
#ifdef SUPPORT_UNICODE
- if (prop_type >= 0)
+ if (proptype >= 0)
{
- switch(prop_type)
+ switch(proptype)
{
case PT_ANY:
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM36);
+ RMATCH(Fecode, RM208);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(fc, Feptr);
+ if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_LAMP:
- for (fi = min;; fi++)
+ for (;;)
{
int chartype;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM37);
+ RMATCH(Fecode, RM209);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- chartype = UCD_CHARTYPE(c);
+ GETCHARINCTEST(fc, Feptr);
+ chartype = UCD_CHARTYPE(fc);
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
- chartype == ucp_Lt) == prop_fail_result)
+ chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_GC:
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM38);
+ RMATCH(Fecode, RM210);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_PC:
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM39);
+ RMATCH(Fecode, RM211);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_SC:
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM40);
+ RMATCH(Fecode, RM212);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_ALNUM:
- for (fi = min;; fi++)
+ for (;;)
{
int category;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM59);
+ RMATCH(Fecode, RM213);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- category = UCD_CATEGORY(c);
- if ((category == ucp_L || category == ucp_N) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ category = UCD_CATEGORY(fc);
+ if ((category == ucp_L || category == ucp_N) ==
+ (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -5092,26 +3391,26 @@ for (;;)
case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM61);
+ RMATCH(Fecode, RM214);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- switch(c)
+ GETCHARINCTEST(fc, Feptr);
+ switch(fc)
{
HSPACE_CASES:
VSPACE_CASES:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+ if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
break;
default:
- if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
break;
}
@@ -5119,105 +3418,101 @@ for (;;)
/* Control never gets here */
case PT_WORD:
- for (fi = min;; fi++)
+ for (;;)
{
int category;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM62);
+ RMATCH(Fecode, RM215);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- category = UCD_CATEGORY(c);
+ GETCHARINCTEST(fc, Feptr);
+ category = UCD_CATEGORY(fc);
if ((category == ucp_L ||
category == ucp_N ||
- c == CHAR_UNDERSCORE)
- == prop_fail_result)
+ fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
case PT_CLIST:
- for (fi = min;; fi++)
+ for (;;)
{
const uint32_t *cp;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM67);
+ RMATCH(Fecode, RM216);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- cp = PRIV(ucd_caseless_sets) + prop_value;
+ GETCHARINCTEST(fc, Feptr);
+ cp = PRIV(ucd_caseless_sets) + Lpropvalue;
for (;;)
{
- if (c < *cp)
- { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
- if (c == *cp++)
- { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
+ if (fc < *cp)
+ {
+ if (Lctype == OP_NOTPROP) break;
+ RRETURN(MATCH_NOMATCH);
+ }
+ if (fc == *cp++)
+ {
+ if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+ break;
+ }
}
}
/* Control never gets here */
case PT_UCNC:
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM60);
+ RMATCH(Fecode, RM217);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- GETCHARINCTEST(c, eptr);
- if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
- c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
- c >= 0xe000) == prop_fail_result)
+ GETCHARINCTEST(fc, Feptr);
+ if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
+ fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
+ fc >= 0xe000) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
/* This should never occur */
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
}
/* Match extended Unicode sequences. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
- else if (ctype == OP_EXTUNI)
+ else if (Lctype == OP_EXTUNI)
{
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM41);
+ RMATCH(Fecode, RM218);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
else
{
- int lgb, rgb;
- GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
- while (eptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
- eptr += len;
- }
+ GETCHARINCTEST(fc, Feptr);
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
+ utf, NULL);
}
CHECK_PARTIAL();
}
@@ -5225,33 +3520,34 @@ for (;;)
else
#endif /* SUPPORT_UNICODE */
+ /* UTF mode for non-property testing character types. */
+
#ifdef SUPPORT_UNICODE
if (utf)
{
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM42);
+ RMATCH(Fecode, RM219);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (ctype == OP_ANY && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(ctype)
+ if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(fc, Feptr);
+ switch(Lctype)
{
case OP_ANY: /* This is the non-NL case */
if (mb->partial != 0 && /* Take care with CRLF partial */
- eptr >= mb->end_subject &&
+ Feptr >= mb->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- c == NLBLOCK->nl[0])
+ fc == NLBLOCK->nl[0])
{
mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
break;
@@ -5260,11 +3556,12 @@ for (;;)
break;
case OP_ANYNL:
- switch(c)
+ switch(fc)
{
default: RRETURN(MATCH_NOMATCH);
+
case CHAR_CR:
- if (eptr < mb->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
+ if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
break;
case CHAR_LF:
@@ -5277,13 +3574,14 @@ for (;;)
case 0x2028:
case 0x2029:
#endif /* Not EBCDIC */
- if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
+ if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
+ RRETURN(MATCH_NOMATCH);
break;
}
break;
case OP_NOT_HSPACE:
- switch(c)
+ switch(fc)
{
HSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
@@ -5291,7 +3589,7 @@ for (;;)
break;
case OP_HSPACE:
- switch(c)
+ switch(fc)
{
HSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
@@ -5299,7 +3597,7 @@ for (;;)
break;
case OP_NOT_VSPACE:
- switch(c)
+ switch(fc)
{
VSPACE_CASES: RRETURN(MATCH_NOMATCH);
default: break;
@@ -5307,7 +3605,7 @@ for (;;)
break;
case OP_VSPACE:
- switch(c)
+ switch(fc)
{
VSPACE_CASES: break;
default: RRETURN(MATCH_NOMATCH);
@@ -5315,68 +3613,69 @@ for (;;)
break;
case OP_NOT_DIGIT:
- if (c < 256 && (mb->ctypes[c] & ctype_digit) != 0)
+ if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
- if (c >= 256 || (mb->ctypes[c] & ctype_digit) == 0)
+ if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
- if (c < 256 && (mb->ctypes[c] & ctype_space) != 0)
+ if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
- if (c >= 256 || (mb->ctypes[c] & ctype_space) == 0)
+ if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
- if (c < 256 && (mb->ctypes[c] & ctype_word) != 0)
+ if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
- if (c >= 256 || (mb->ctypes[c] & ctype_word) == 0)
+ if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
}
}
else
-#endif
+#endif /* SUPPORT_UNICODE */
+
/* Not UTF mode */
{
- for (fi = min;; fi++)
+ for (;;)
{
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM43);
+ RMATCH(Fecode, RM33);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) RRETURN(MATCH_NOMATCH);
- if (eptr >= mb->end_subject)
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (ctype == OP_ANY && IS_NEWLINE(eptr))
+ if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- switch(ctype)
+ fc = *Feptr++;
+ switch(Lctype)
{
case OP_ANY: /* This is the non-NL case */
if (mb->partial != 0 && /* Take care with CRLF partial */
- eptr >= mb->end_subject &&
+ Feptr >= mb->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- c == NLBLOCK->nl[0])
+ fc == NLBLOCK->nl[0])
{
mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
break;
@@ -5385,11 +3684,12 @@ for (;;)
break;
case OP_ANYNL:
- switch(c)
+ switch(fc)
{
default: RRETURN(MATCH_NOMATCH);
+
case CHAR_CR:
- if (eptr < mb->end_subject && *eptr == CHAR_LF) eptr++;
+ if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
break;
case CHAR_LF:
@@ -5402,13 +3702,14 @@ for (;;)
case 0x2028:
case 0x2029:
#endif
- if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
+ if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
+ RRETURN(MATCH_NOMATCH);
break;
}
break;
case OP_NOT_HSPACE:
- switch(c)
+ switch(fc)
{
default: break;
HSPACE_BYTE_CASES:
@@ -5420,7 +3721,7 @@ for (;;)
break;
case OP_HSPACE:
- switch(c)
+ switch(fc)
{
default: RRETURN(MATCH_NOMATCH);
HSPACE_BYTE_CASES:
@@ -5432,7 +3733,7 @@ for (;;)
break;
case OP_NOT_VSPACE:
- switch(c)
+ switch(fc)
{
default: break;
VSPACE_BYTE_CASES:
@@ -5444,7 +3745,7 @@ for (;;)
break;
case OP_VSPACE:
- switch(c)
+ switch(fc)
{
default: RRETURN(MATCH_NOMATCH);
VSPACE_BYTE_CASES:
@@ -5456,31 +3757,37 @@ for (;;)
break;
case OP_NOT_DIGIT:
- if (MAX_255(c) && (mb->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
+ if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
+ RRETURN(MATCH_NOMATCH);
break;
case OP_DIGIT:
- if (!MAX_255(c) || (mb->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
+ if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
+ RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WHITESPACE:
- if (MAX_255(c) && (mb->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
+ if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
+ RRETURN(MATCH_NOMATCH);
break;
case OP_WHITESPACE:
- if (!MAX_255(c) || (mb->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
+ if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
+ RRETURN(MATCH_NOMATCH);
break;
case OP_NOT_WORDCHAR:
- if (MAX_255(c) && (mb->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
+ if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
+ RRETURN(MATCH_NOMATCH);
break;
case OP_WORDCHAR:
- if (!MAX_255(c) || (mb->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
+ if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
+ RRETURN(MATCH_NOMATCH);
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
}
}
@@ -5488,113 +3795,116 @@ for (;;)
}
/* If maximizing, it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). Again, keep the
- UTF-8 and UCP stuff separate. */
+ test once at the start (i.e. keep it out of the loop). */
else
{
- pp = eptr; /* Remember where we started */
+ Lstart_eptr = Feptr; /* Remember where we started */
#ifdef SUPPORT_UNICODE
- if (prop_type >= 0)
+ if (proptype >= 0)
{
- switch(prop_type)
+ switch(proptype)
{
case PT_ANY:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- if (prop_fail_result) break;
- eptr+= len;
+ GETCHARLENTEST(fc, Feptr, len);
+ if (Lctype == OP_NOTPROP) break;
+ Feptr+= len;
}
break;
case PT_LAMP:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int chartype;
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- chartype = UCD_CHARTYPE(c);
+ GETCHARLENTEST(fc, Feptr, len);
+ chartype = UCD_CHARTYPE(fc);
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
- chartype == ucp_Lt) == prop_fail_result)
+ chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
break;
- eptr+= len;
+ Feptr+= len;
}
break;
case PT_GC:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
- eptr+= len;
+ GETCHARLENTEST(fc, Feptr, len);
+ if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ break;
+ Feptr+= len;
}
break;
case PT_PC:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
- eptr+= len;
+ GETCHARLENTEST(fc, Feptr, len);
+ if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ break;
+ Feptr+= len;
}
break;
case PT_SC:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
- eptr+= len;
+ GETCHARLENTEST(fc, Feptr, len);
+ if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
+ break;
+ Feptr+= len;
}
break;
case PT_ALNUM:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int category;
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- category = UCD_CATEGORY(c);
- if ((category == ucp_L || category == ucp_N) == prop_fail_result)
+ GETCHARLENTEST(fc, Feptr, len);
+ category = UCD_CATEGORY(fc);
+ if ((category == ucp_L || category == ucp_N) ==
+ (Lctype == OP_NOTPROP))
break;
- eptr+= len;
+ Feptr+= len;
}
break;
@@ -5604,186 +3914,178 @@ for (;;)
case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- switch(c)
+ GETCHARLENTEST(fc, Feptr, len);
+ switch(fc)
{
HSPACE_CASES:
VSPACE_CASES:
- if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
+ if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */
break;
default:
- if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+ if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
goto ENDLOOP99; /* Break the loop */
break;
}
- eptr+= len;
+ Feptr+= len;
}
ENDLOOP99:
break;
case PT_WORD:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int category;
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- category = UCD_CATEGORY(c);
+ GETCHARLENTEST(fc, Feptr, len);
+ category = UCD_CATEGORY(fc);
if ((category == ucp_L || category == ucp_N ||
- c == CHAR_UNDERSCORE) == prop_fail_result)
+ fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
break;
- eptr+= len;
+ Feptr+= len;
}
break;
case PT_CLIST:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
const uint32_t *cp;
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- cp = PRIV(ucd_caseless_sets) + prop_value;
+ GETCHARLENTEST(fc, Feptr, len);
+ cp = PRIV(ucd_caseless_sets) + Lpropvalue;
for (;;)
{
- if (c < *cp)
- { if (prop_fail_result) break; else goto GOT_MAX; }
- if (c == *cp++)
- { if (prop_fail_result) goto GOT_MAX; else break; }
+ if (fc < *cp)
+ { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; }
+ if (fc == *cp++)
+ { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; }
}
- eptr += len;
+ Feptr += len;
}
GOT_MAX:
break;
case PT_UCNC:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLENTEST(c, eptr, len);
- if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
- c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
- c >= 0xe000) == prop_fail_result)
+ GETCHARLENTEST(fc, Feptr, len);
+ if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
+ fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
+ fc >= 0xe000) == (Lctype == OP_NOTPROP))
break;
- eptr += len;
+ Feptr += len;
}
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
- /* eptr is now past the end of the maximum run */
+ /* Feptr is now past the end of the maximum run */
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
- /* After \C in UTF mode, pp might be in the middle of a Unicode
- character. Use <= pp to ensure backtracking doesn't go too far. */
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
+ go too far. */
for(;;)
{
- if (eptr <= pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM44);
+ if (Feptr <= Lstart_eptr) break;
+ RMATCH(Fecode, RM222);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- if (utf) BACKCHAR(eptr);
+ Feptr--;
+ if (utf) BACKCHAR(Feptr);
}
}
/* Match extended Unicode grapheme clusters. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
- else if (ctype == OP_EXTUNI)
+ else if (Lctype == OP_EXTUNI)
{
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
else
{
- int lgb, rgb;
- GETCHARINCTEST(c, eptr);
- lgb = UCD_GRAPHBREAK(c);
- while (eptr < mb->end_subject)
- {
- int len = 1;
- if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
- rgb = UCD_GRAPHBREAK(c);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- lgb = rgb;
- eptr += len;
- }
+ GETCHARINCTEST(fc, Feptr);
+ Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
+ utf, NULL);
}
CHECK_PARTIAL();
}
- /* eptr is now past the end of the maximum run */
+ /* Feptr is now past the end of the maximum run */
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
- /* We use <= pp rather than == pp to detect the start of the run while
- backtracking because the use of \C in UTF mode can cause BACKCHAR to
- move back past pp. This is just palliative; the use of \C in UTF mode
- is fraught with danger. */
+ /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
+ of the run while backtracking because the use of \C in UTF mode can
+ cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
+ the use of \C in UTF mode is fraught with danger. */
for(;;)
{
int lgb, rgb;
PCRE2_SPTR fptr;
- if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45);
+ if (Feptr <= Lstart_eptr) break; /* At start of char run */
+ RMATCH(Fecode, RM220);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* Backtracking over an extended grapheme cluster involves inspecting
the previous two characters (if present) to see if a break is
permitted between them. */
- eptr--;
- if (!utf) c = *eptr; else
+ Feptr--;
+ if (!utf) fc = *Feptr; else
{
- BACKCHAR(eptr);
- GETCHAR(c, eptr);
+ BACKCHAR(Feptr);
+ GETCHAR(fc, Feptr);
}
- rgb = UCD_GRAPHBREAK(c);
+ rgb = UCD_GRAPHBREAK(fc);
for (;;)
{
- if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
- fptr = eptr - 1;
- if (!utf) c = *fptr; else
+ if (Feptr <= Lstart_eptr) break; /* At start of char run */
+ fptr = Feptr - 1;
+ if (!utf) fc = *fptr; else
{
BACKCHAR(fptr);
- GETCHAR(c, fptr);
+ GETCHAR(fc, fptr);
}
- lgb = UCD_GRAPHBREAK(c);
+ lgb = UCD_GRAPHBREAK(fc);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
- eptr = fptr;
+ Feptr = fptr;
rgb = lgb;
}
}
@@ -5795,325 +4097,328 @@ for (;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- switch(ctype)
+ switch(Lctype)
{
case OP_ANY:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (IS_NEWLINE(eptr)) break;
+ if (IS_NEWLINE(Feptr)) break;
if (mb->partial != 0 && /* Take care with CRLF partial */
- eptr + 1 >= mb->end_subject &&
+ Feptr + 1 >= mb->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- UCHAR21(eptr) == NLBLOCK->nl[0])
+ UCHAR21(Feptr) == NLBLOCK->nl[0])
{
mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
- eptr++;
- ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
+ Feptr++;
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
break;
case OP_ALLANY:
- if (max < INT_MAX)
+ if (Lmax < UINT32_MAX)
{
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- eptr++;
- ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
+ Feptr++;
+ ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
}
}
else
{
- eptr = mb->end_subject; /* Unlimited UTF-8 repeat */
+ Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
SCHECK_PARTIAL();
}
break;
- /* The byte case is the same as non-UTF8 */
+ /* The "byte" (i.e. "code unit") case is the same as non-UTF */
case OP_ANYBYTE:
- c = max - min;
- if (c > (uint32_t)(mb->end_subject - eptr))
+ fc = Lmax - Lmin;
+ if (fc > (uint32_t)(mb->end_subject - Feptr))
{
- eptr = mb->end_subject;
+ Feptr = mb->end_subject;
SCHECK_PARTIAL();
}
- else eptr += c;
+ else Feptr += fc;
break;
case OP_ANYNL:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c == CHAR_CR)
+ GETCHARLEN(fc, Feptr, len);
+ if (fc == CHAR_CR)
{
- if (++eptr >= mb->end_subject) break;
- if (UCHAR21(eptr) == CHAR_LF) eptr++;
+ if (++Feptr >= mb->end_subject) break;
+ if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
}
else
{
- if (c != CHAR_LF &&
+ if (fc != CHAR_LF &&
(mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
- (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
+ (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
#ifndef EBCDIC
- && c != 0x2028 && c != 0x2029
+ && fc != 0x2028 && fc != 0x2029
#endif /* Not EBCDIC */
)))
break;
- eptr += len;
+ Feptr += len;
}
}
break;
case OP_NOT_HSPACE:
case OP_HSPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
BOOL gotspace;
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- switch(c)
+ GETCHARLEN(fc, Feptr, len);
+ switch(fc)
{
HSPACE_CASES: gotspace = TRUE; break;
default: gotspace = FALSE; break;
}
- if (gotspace == (ctype == OP_NOT_HSPACE)) break;
- eptr += len;
+ if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
+ Feptr += len;
}
break;
case OP_NOT_VSPACE:
case OP_VSPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
BOOL gotspace;
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- switch(c)
+ GETCHARLEN(fc, Feptr, len);
+ switch(fc)
{
VSPACE_CASES: gotspace = TRUE; break;
default: gotspace = FALSE; break;
}
- if (gotspace == (ctype == OP_NOT_VSPACE)) break;
- eptr += len;
+ if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
+ Feptr += len;
}
break;
case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (mb->ctypes[c] & ctype_digit) != 0) break;
- eptr+= len;
+ GETCHARLEN(fc, Feptr, len);
+ if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
+ Feptr+= len;
}
break;
case OP_DIGIT:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(mb->ctypes[c] & ctype_digit) == 0) break;
- eptr+= len;
+ GETCHARLEN(fc, Feptr, len);
+ if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
+ Feptr+= len;
}
break;
case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (mb->ctypes[c] & ctype_space) != 0) break;
- eptr+= len;
+ GETCHARLEN(fc, Feptr, len);
+ if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
+ Feptr+= len;
}
break;
case OP_WHITESPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(mb->ctypes[c] & ctype_space) == 0) break;
- eptr+= len;
+ GETCHARLEN(fc, Feptr, len);
+ if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
+ Feptr+= len;
}
break;
case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (mb->ctypes[c] & ctype_word) != 0) break;
- eptr+= len;
+ GETCHARLEN(fc, Feptr, len);
+ if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
+ Feptr+= len;
}
break;
case OP_WORDCHAR:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
int len = 1;
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- GETCHARLEN(c, eptr, len);
- if (c >= 256 || (mb->ctypes[c] & ctype_word) == 0) break;
- eptr+= len;
+ GETCHARLEN(fc, Feptr, len);
+ if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
+ Feptr+= len;
}
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
- /* After \C in UTF mode, pp might be in the middle of a Unicode
- character. Use <= pp to ensure backtracking doesn't go too far. */
+ /* After \C in UTF mode, Lstart_eptr might be in the middle of a
+ Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
+ too far. */
for(;;)
{
- if (eptr <= pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM46);
+ if (Feptr <= Lstart_eptr) break;
+ RMATCH(Fecode, RM221);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- BACKCHAR(eptr);
- if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
- UCHAR21(eptr - 1) == CHAR_CR) eptr--;
+ Feptr--;
+ BACKCHAR(Feptr);
+ if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
+ UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
+ Feptr--;
}
}
else
#endif /* SUPPORT_UNICODE */
+
/* Not UTF mode */
{
- switch(ctype)
+ switch(Lctype)
{
case OP_ANY:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (IS_NEWLINE(eptr)) break;
+ if (IS_NEWLINE(Feptr)) break;
if (mb->partial != 0 && /* Take care with CRLF partial */
- eptr + 1 >= mb->end_subject &&
+ Feptr + 1 >= mb->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ *Feptr == NLBLOCK->nl[0])
{
mb->hitend = TRUE;
- if (mb->partial > 1) RRETURN(PCRE2_ERROR_PARTIAL);
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
}
- eptr++;
+ Feptr++;
}
break;
case OP_ALLANY:
case OP_ANYBYTE:
- c = max - min;
- if (c > (uint32_t)(mb->end_subject - eptr))
+ fc = Lmax - Lmin;
+ if (fc > (uint32_t)(mb->end_subject - Feptr))
{
- eptr = mb->end_subject;
+ Feptr = mb->end_subject;
SCHECK_PARTIAL();
}
- else eptr += c;
+ else Feptr += fc;
break;
case OP_ANYNL:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- c = *eptr;
- if (c == CHAR_CR)
+ fc = *Feptr;
+ if (fc == CHAR_CR)
{
- if (++eptr >= mb->end_subject) break;
- if (*eptr == CHAR_LF) eptr++;
+ if (++Feptr >= mb->end_subject) break;
+ if (*Feptr == CHAR_LF) Feptr++;
}
else
{
- if (c != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
- (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
+ if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
+ (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
#if PCRE2_CODE_UNIT_WIDTH != 8
- && c != 0x2028 && c != 0x2029
+ && fc != 0x2028 && fc != 0x2029
#endif
))) break;
- eptr++;
+ Feptr++;
}
}
break;
case OP_NOT_HSPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- switch(*eptr)
+ switch(*Feptr)
{
- default: eptr++; break;
+ default: Feptr++; break;
HSPACE_BYTE_CASES:
#if PCRE2_CODE_UNIT_WIDTH != 8
HSPACE_MULTIBYTE_CASES:
@@ -6125,37 +4430,37 @@ for (;;)
break;
case OP_HSPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- switch(*eptr)
+ switch(*Feptr)
{
default: goto ENDLOOP01;
HSPACE_BYTE_CASES:
#if PCRE2_CODE_UNIT_WIDTH != 8
HSPACE_MULTIBYTE_CASES:
#endif
- eptr++; break;
+ Feptr++; break;
}
}
ENDLOOP01:
break;
case OP_NOT_VSPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- switch(*eptr)
+ switch(*Feptr)
{
- default: eptr++; break;
+ default: Feptr++; break;
VSPACE_BYTE_CASES:
#if PCRE2_CODE_UNIT_WIDTH != 8
VSPACE_MULTIBYTE_CASES:
@@ -6167,251 +4472,1502 @@ for (;;)
break;
case OP_VSPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- switch(*eptr)
+ switch(*Feptr)
{
default: goto ENDLOOP03;
VSPACE_BYTE_CASES:
#if PCRE2_CODE_UNIT_WIDTH != 8
VSPACE_MULTIBYTE_CASES:
#endif
- eptr++; break;
+ Feptr++; break;
}
}
ENDLOOP03:
break;
case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_digit) != 0) break;
- eptr++;
+ if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
+ break;
+ Feptr++;
}
break;
case OP_DIGIT:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_digit) == 0) break;
- eptr++;
+ if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
+ break;
+ Feptr++;
}
break;
case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_space) != 0) break;
- eptr++;
+ if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
+ break;
+ Feptr++;
}
break;
case OP_WHITESPACE:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_space) == 0) break;
- eptr++;
+ if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
+ break;
+ Feptr++;
}
break;
case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (MAX_255(*eptr) && (mb->ctypes[*eptr] & ctype_word) != 0) break;
- eptr++;
+ if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
+ break;
+ Feptr++;
}
break;
case OP_WORDCHAR:
- for (i = min; i < max; i++)
+ for (i = Lmin; i < Lmax; i++)
{
- if (eptr >= mb->end_subject)
+ if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
break;
}
- if (!MAX_255(*eptr) || (mb->ctypes[*eptr] & ctype_word) == 0) break;
- eptr++;
+ if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
+ break;
+ Feptr++;
}
break;
default:
- RRETURN(PCRE2_ERROR_INTERNAL);
+ return PCRE2_ERROR_INTERNAL;
}
- if (possessive) continue; /* No backtracking */
+ if (reptype == REPTYPE_POS) continue; /* No backtracking */
+
for (;;)
{
- if (eptr == pp) goto TAIL_RECURSE;
- RMATCH(eptr, ecode, offset_top, mb, eptrb, RM47);
+ if (Feptr == Lstart_eptr) break;
+ RMATCH(Fecode, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
- eptr[-1] == CHAR_CR) eptr--;
+ Feptr--;
+ if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
+ Feptr[-1] == CHAR_CR) Feptr--;
+ }
+ }
+ }
+ break; /* End of repeat character type processing */
+
+#undef Lstart_eptr
+#undef Lmin
+#undef Lmax
+#undef Lctype
+#undef Lpropvalue
+
+
+ /* ===================================================================== */
+ /* Match a back reference, possibly repeatedly. Look past the end of the
+ item to see if there is repeat information following. The OP_REF and
+ OP_REFI opcodes are used for a reference to a numbered group or to a
+ non-duplicated named group. For a duplicated named group, OP_DNREF and
+ OP_DNREFI are used. In this case we must scan the list of groups to which
+ the name refers, and use the first one that is set. */
+
+#define Lmin F->temp_32[0]
+#define Lmax F->temp_32[1]
+#define Lcaseless F->temp_32[2]
+#define Lstart F->temp_sptr[0]
+#define Loffset F->temp_size
+
+ case OP_DNREF:
+ case OP_DNREFI:
+ Lcaseless = (Fop == OP_DNREFI);
+ {
+ int count = GET2(Fecode, 1+IMM2_SIZE);
+ PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
+ Fecode += 1 + 2*IMM2_SIZE;
+
+ while (count-- > 0)
+ {
+ Loffset = (GET2(slot, 0) << 1) - 2;
+ if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
+ slot += mb->name_entry_size;
+ }
+ }
+ goto REF_REPEAT;
+
+ case OP_REF:
+ case OP_REFI:
+ Lcaseless = (Fop == OP_REFI);
+ Loffset = (GET2(Fecode, 1) << 1) - 2;
+ Fecode += 1 + IMM2_SIZE;
+
+ /* Set up for repetition, or handle the non-repeated case. The maximum and
+ minimum must be in the heap frame, but as they are short-term values, we
+ use temporary fields. */
+
+ REF_REPEAT:
+ switch (*Fecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ fc = *Fecode++ - OP_CRSTAR;
+ Lmin = rep_min[fc];
+ Lmax = rep_max[fc];
+ reptype = rep_typ[fc];
+ break;
+
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ Lmin = GET2(Fecode, 1);
+ Lmax = GET2(Fecode, 1 + IMM2_SIZE);
+ reptype = rep_typ[*Fecode - OP_CRSTAR];
+ if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
+ Fecode += 1 + 2 * IMM2_SIZE;
+ break;
+
+ default: /* No repeat follows */
+ {
+ rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
+ if (rrc != 0)
+ {
+ if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
+ CHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
}
}
+ Feptr += length;
+ continue; /* With the main loop */
+ }
+
+ /* Handle repeated back references. If a set group has length zero, just
+ continue with the main loop, because it matches however many times. For an
+ unset reference, if the minimum is zero, we can also just continue. We can
+ also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an
+ unset group behave as a zero-length group. For any other unset cases, carrying
+ on will result in NOMATCH. */
+
+ if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
+ {
+ if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
+ }
+ else /* Group is not set */
+ {
+ if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
+ continue;
+ }
+
+ /* First, ensure the minimum number of matches are present. */
+
+ for (i = 1; i <= Lmin; i++)
+ {
+ PCRE2_SIZE slength;
+ rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
+ if (rrc != 0)
+ {
+ if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
+ CHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ Feptr += slength;
+ }
+
+ /* If min = max, we are done. They are not both allowed to be zero. */
+ if (Lmin == Lmax) continue;
+
+ /* If minimizing, keep trying and advancing the pointer. */
+
+ if (reptype == REPTYPE_MIN)
+ {
+ for (;;)
+ {
+ PCRE2_SIZE slength;
+ RMATCH(Fecode, RM20);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
+ rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
+ if (rrc != 0)
+ {
+ if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
+ CHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ Feptr += slength;
+ }
/* Control never gets here */
}
- /* There's been some horrible disaster. Arrival here can only mean there is
- something seriously wrong in the code above or the OP_xxx definitions. */
+ /* If maximizing, find the longest string and work backwards, as long as
+ the matched lengths for each iteration are the same. */
- default:
- RRETURN(PCRE2_ERROR_INTERNAL);
- }
+ else
+ {
+ BOOL samelengths = TRUE;
+ Lstart = Feptr; /* Starting position */
+ Flength = Fovector[Loffset+1] - Fovector[Loffset];
- /* Do not stick any code in here without much thought; it is assumed
- that "continue" in the code above comes out to here to repeat the main
- loop. */
+ for (i = Lmin; i < Lmax; i++)
+ {
+ PCRE2_SIZE slength;
+ rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
+ if (rrc != 0)
+ {
+ /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
+ the soft partial matching case. */
- } /* End of main loop */
-/* Control never reaches here */
+ if (rrc > 0 && mb->partial != 0 &&
+ mb->end_subject > mb->start_used_ptr)
+ {
+ mb->hitend = TRUE;
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
+ }
+ break;
+ }
+ if (slength != Flength) samelengths = FALSE;
+ Feptr += slength;
+ }
-/* When compiling to use the heap rather than the stack for recursive calls to
-match(), the RRETURN() macro jumps here. The number that is saved in
-frame->Xwhere indicates which label we actually want to return to. */
+ /* If the length matched for each repetition is the same as the length of
+ the captured group, we can easily work backwards. This is the normal
+ case. However, in caseless UTF-8 mode there are pairs of case-equivalent
+ characters whose lengths (in terms of code units) differ. Because this is
+ very rare, we handle it by re-matching fewer and fewer times. */
-#ifdef HEAP_MATCH_RECURSE
-#define LBL(val) case val: goto L_RM##val;
-HEAP_RETURN:
-switch (frame->Xwhere)
- {
- LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
- LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
- LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
- LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
- LBL(65) LBL(66) LBL(68)
-#ifdef SUPPORT_WIDE_CHARS
- LBL(20) LBL(21)
+ if (samelengths)
+ {
+ while (Feptr >= Lstart)
+ {
+ RMATCH(Fecode, RM21);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Feptr -= Flength;
+ }
+ }
+
+ /* The rare case of non-matching lengths. Re-scan the repetition for each
+ iteration. We know that match_ref() will succeed every time. */
+
+ else
+ {
+ Lmax = i;
+ for (;;)
+ {
+ RMATCH(Fecode, RM22);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (Feptr == Lstart) break; /* Failed after minimal repetition */
+ Feptr = Lstart;
+ Lmax--;
+ for (i = Lmin; i < Lmax; i++)
+ {
+ PCRE2_SIZE slength;
+ (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
+ Feptr += slength;
+ }
+ }
+ }
+
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+
+#undef Lcaseless
+#undef Lmin
+#undef Lmax
+#undef Lstart
+#undef Loffset
+
+
+
+/* ========================================================================= */
+/* Opcodes for the start of various parenthesized items */
+/* ========================================================================= */
+
+ /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
+ (*THEN) is within the current branch by comparing the address of OP_THEN
+ that is passed back with the end of the branch. If (*THEN) is within the
+ current branch, and the branch is one of two or more alternatives (it
+ either starts or ends with OP_ALT), we have reached the limit of THEN's
+ action, so convert the return code to NOMATCH, which will cause normal
+ backtracking to happen from now on. Otherwise, THEN is passed back to an
+ outer alternative. This implements Perl's treatment of parenthesized
+ groups, where a group not containing | does not affect the current
+ alternative, that is, (X) is NOT the same as (X|(*F)). */
+
+
+ /* ===================================================================== */
+ /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
+ bracket group, indicating that it may occur zero times. It may repeat
+ infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
+ the pattern. Brackets with fixed upper repeat limits are compiled as a
+ number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
+ Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
+
+#define Lnext_ecode F->temp_sptr[0]
+
+ case OP_BRAZERO:
+ Lnext_ecode = Fecode + 1;
+ RMATCH(Lnext_ecode, RM9);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
+ Fecode = Lnext_ecode + 1 + LINK_SIZE;
+ break;
+
+ case OP_BRAMINZERO:
+ Lnext_ecode = Fecode + 1;
+ do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
+ RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Fecode++;
+ break;
+
+#undef Lnext_ecode
+
+ case OP_SKIPZERO:
+ Fecode++;
+ do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
+ Fecode += 1 + LINK_SIZE;
+ break;
+
+
+ /* ===================================================================== */
+ /* Handle possessive brackets with an unlimited repeat. The end of these
+ brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
+ going further in the pattern. */
+
+#define Lframe_type F->temp_32[0]
+#define Lmatched_once F->temp_32[1]
+#define Lzero_allowed F->temp_32[2]
+#define Lstart_eptr F->temp_sptr[0]
+#define Lstart_group F->temp_sptr[1]
+
+ case OP_BRAPOSZERO:
+ Lzero_allowed = TRUE; /* Zero repeat is allowed */
+ Fecode += 1;
+ if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
+ goto POSSESSIVE_CAPTURE;
+ goto POSSESSIVE_NON_CAPTURE;
+
+ case OP_BRAPOS:
+ case OP_SBRAPOS:
+ Lzero_allowed = FALSE; /* Zero repeat not allowed */
+
+ POSSESSIVE_NON_CAPTURE:
+ Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
+ goto POSSESSIVE_GROUP;
+
+ case OP_CBRAPOS:
+ case OP_SCBRAPOS:
+ Lzero_allowed = FALSE; /* Zero repeat not allowed */
+
+ POSSESSIVE_CAPTURE:
+ number = GET2(Fecode, 1+LINK_SIZE);
+ Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
+
+ POSSESSIVE_GROUP:
+ Lmatched_once = FALSE; /* Never matched */
+ Lstart_group = Fecode; /* Start of this group */
+
+ for (;;)
+ {
+ Lstart_eptr = Feptr; /* Position at group start */
+ group_frame_type = Lframe_type;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
+ if (rrc == MATCH_KETRPOS)
+ {
+ Lmatched_once = TRUE; /* Matched at least once */
+ if (Feptr == Lstart_eptr) /* Empty match; skip to end */
+ {
+ do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
+ break;
+ }
+
+ Fecode = Lstart_group;
+ continue;
+ }
+
+ /* See comment above about handling THEN. */
+
+ if (rrc == MATCH_THEN)
+ {
+ PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
+ if (mb->verb_ecode_ptr < next_ecode &&
+ (*Fecode == OP_ALT || *next_ecode == OP_ALT))
+ rrc = MATCH_NOMATCH;
+ }
+
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Fecode += GET(Fecode, 1);
+ if (*Fecode != OP_ALT) break;
+ }
+
+ /* Success if matched something or zero repeat allowed */
+
+ if (Lmatched_once || Lzero_allowed)
+ {
+ Fecode += 1 + LINK_SIZE;
+ break;
+ }
+
+ RRETURN(MATCH_NOMATCH);
+
+#undef Lmatched_once
+#undef Lzero_allowed
+#undef Lframe_type
+#undef Lstart_eptr
+#undef Lstart_group
+
+
+ /* ===================================================================== */
+ /* Handle non-capturing brackets that cannot match an empty string. When we
+ get to the final alternative within the brackets, as long as there are no
+ THEN's in the pattern, we can optimize by not recording a new backtracking
+ point. (Ideally we should test for a THEN within this group, but we don't
+ have that information.) Don't do this if we are at the very top level,
+ however, because that would make handling assertions and once-only brackets
+ messier when there is nothing to go back to. */
+
+#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
+#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
+
+ case OP_BRA:
+ if (mb->hasthen || Frdepth == 0)
+ {
+ Lframe_type = 0;
+ goto GROUPLOOP;
+ }
+
+ for (;;)
+ {
+ Lnext_branch = Fecode + GET(Fecode, 1);
+ if (*Lnext_branch != OP_ALT) break;
+
+ /* This is never the final branch. We do not need to test for MATCH_THEN
+ here because this code is not used when there is a THEN in the pattern. */
+
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Fecode = Lnext_branch;
+ }
+
+ /* Hit the start of the final branch. Continue at this level. */
+
+ Fecode += PRIV(OP_lengths)[*Fecode];
+ break;
+
+#undef Lnext_branch
+
+
+ /* ===================================================================== */
+ /* Handle a capturing bracket, other than those that are possessive with an
+ unlimited repeat. */
+
+ case OP_CBRA:
+ case OP_SCBRA:
+ Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
+ goto GROUPLOOP;
+
+
+ /* ===================================================================== */
+ /* Atomic groups and non-capturing brackets that can match an empty string
+ must record a backtracking point and also set up a chained frame. */
+
+ case OP_ONCE:
+ case OP_SBRA:
+ Lframe_type = GF_NOCAPTURE | Fop;
+
+ GROUPLOOP:
+ for (;;)
+ {
+ group_frame_type = Lframe_type;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
+ if (rrc == MATCH_THEN)
+ {
+ PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
+ if (mb->verb_ecode_ptr < next_ecode &&
+ (*Fecode == OP_ALT || *next_ecode == OP_ALT))
+ rrc = MATCH_NOMATCH;
+ }
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Fecode += GET(Fecode, 1);
+ if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never reaches here. */
+
+#undef Lframe_type
+
+
+ /* ===================================================================== */
+ /* Recursion either matches the current regex, or some subexpression. The
+ offset data is the offset to the starting bracket from the start of the
+ whole pattern. (This is so that it works from duplicated subpatterns.) */
+
+#define Lframe_type F->temp_32[0]
+#define Lstart_branch F->temp_sptr[0]
+
+ case OP_RECURSE:
+ bracode = mb->start_code + GET(Fecode, 1);
+ number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
+
+ /* If we are already in a recursion, check for repeating the same one
+ without advancing the subject pointer. This should catch convoluted mutual
+ recursions. (Some simple cases are caught at compile time.) */
+
+ if (Fcurrent_recurse != RECURSE_UNSET)
+ {
+ offset = Flast_group_offset;
+ while (offset != PCRE2_UNSET)
+ {
+ N = (heapframe *)((char *)mb->match_frames + offset);
+ P = (heapframe *)((char *)N - frame_size);
+ if (N->group_frame_type == (GF_RECURSE | number))
+ {
+ if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
+ break;
+ }
+ offset = P->last_group_offset;
+ }
+ }
+
+ /* Now run the recursion, branch by branch. */
+
+ Lstart_branch = bracode;
+ Lframe_type = GF_RECURSE | number;
+
+ for (;;)
+ {
+ PCRE2_SPTR next_ecode;
+
+ group_frame_type = Lframe_type;
+ RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
+ next_ecode = Lstart_branch + GET(Lstart_branch,1);
+
+ /* Handle backtracking verbs, which are defined in a range that can
+ easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
+ escape beyond a recursion; they cause a NOMATCH for the entire recursion.
+
+ When one of these verbs triggers, the current recursion group number is
+ recorded. If it matches the recursion we are processing, the verb
+ happened within the recursion and we must deal with it. Otherwise it must
+ have happened after the recursion completed, and so has to be passed
+ back. See comment above about handling THEN. */
+
+ if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
+ mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
+ {
+ if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
+ (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
+ rrc = MATCH_NOMATCH;
+ else RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
+ OP_ACCEPT code. Nothing needs to be done here. */
+
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Lstart_branch = next_ecode;
+ if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never reaches here. */
+
+#undef Lframe_type
+#undef Lstart_branch
+
+
+ /* ===================================================================== */
+ /* Positive assertions are like other groups except that PCRE doesn't allow
+ the effect of (*THEN) to escape beyond an assertion; it is therefore
+ treated as NOMATCH. (*ACCEPT) is treated as a successful assertion, with its
+ captures and mark retained. Any other return is an error. */
+
+#define Lframe_type F->temp_32[0]
+
+ case OP_ASSERT:
+ case OP_ASSERTBACK:
+ Lframe_type = GF_NOCAPTURE | Fop;
+ for (;;)
+ {
+ group_frame_type = Lframe_type;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
+ if (rrc == MATCH_ACCEPT)
+ {
+ memcpy(Fovector,
+ (char *)assert_accept_frame + offsetof(heapframe, ovector),
+ assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
+ Foffset_top = assert_accept_frame->offset_top;
+ Fmark = assert_accept_frame->mark;
+ break;
+ }
+ if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+ Fecode += GET(Fecode, 1);
+ if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
+ }
+
+ do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
+ Fecode += 1 + LINK_SIZE;
+ break;
+
+#undef Lframe_type
+
+
+ /* ===================================================================== */
+ /* Handle negative assertions. Loop for each non-matching branch as for
+ positive assertions. */
+
+#define Lframe_type F->temp_32[0]
+
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK_NOT:
+ Lframe_type = GF_NOCAPTURE | Fop;
+
+ for (;;)
+ {
+ group_frame_type = Lframe_type;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
+ switch(rrc)
+ {
+ case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
+ case MATCH_MATCH:
+ RRETURN (MATCH_NOMATCH);
+
+ case MATCH_NOMATCH: /* Branch failed, try next if present. */
+ case MATCH_THEN:
+ Fecode += GET(Fecode, 1);
+ if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
+ break;
+
+ case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
+ case MATCH_SKIP:
+ case MATCH_PRUNE:
+ do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
+ goto ASSERT_NOT_FAILED;
+
+ default: /* Pass back any other return */
+ RRETURN(rrc);
+ }
+ }
+
+ /* None of the branches have matched or there was a backtrack to (*COMMIT),
+ (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
+ negative assertion, so carry on. */
+
+ ASSERT_NOT_FAILED:
+ Fecode += 1 + LINK_SIZE;
+ break;
+
+#undef Lframe_type
+
+
+ /* ===================================================================== */
+ /* The callout item calls an external function, if one is provided, passing
+ details of the match so far. This is mainly for debugging, though the
+ function is able to force a failure. */
+
+ case OP_CALLOUT:
+ case OP_CALLOUT_STR:
+ rrc = do_callout(F, mb, &length);
+ if (rrc > 0) RRETURN(MATCH_NOMATCH);
+ if (rrc < 0) RRETURN(rrc);
+ Fecode += length;
+ break;
+
+
+ /* ===================================================================== */
+ /* Conditional group: compilation checked that there are no more than two
+ branches. If the condition is false, skipping the first branch takes us
+ past the end of the item if there is only one branch, but that's exactly
+ what we want. */
+
+ case OP_COND:
+ case OP_SCOND:
+
+ /* The variable Flength will be added to Fecode when the condition is
+ false, to get to the second branch. Setting it to the offset to the ALT or
+ KET, then incrementing Fecode achieves this effect. However, if the second
+ branch is non-existent, we must point to the KET so that the end of the
+ group is correctly processed. We now have Fecode pointing to the condition
+ or callout. */
+
+ Flength = GET(Fecode, 1); /* Offset to the second branch */
+ if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
+ Fecode += 1 + LINK_SIZE; /* From this opcode */
+
+ /* Because of the way auto-callout works during compile, a callout item is
+ inserted between OP_COND and an assertion condition. Such a callout can
+ also be inserted manually. */
+
+ if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
+ {
+ rrc = do_callout(F, mb, &length);
+ if (rrc > 0) RRETURN(MATCH_NOMATCH);
+ if (rrc < 0) RRETURN(rrc);
+
+ /* Advance Fecode past the callout, so it now points to the condition. We
+ must adjust Flength so that the value of Fecode+Flength is unchanged. */
+
+ Fecode += length;
+ Flength -= length;
+ }
+
+ /* Test the various possible conditions */
+
+ condition = FALSE;
+ switch(*Fecode)
+ {
+ case OP_RREF: /* Group recursion test */
+ if (Fcurrent_recurse != RECURSE_UNSET)
+ {
+ number = GET2(Fecode, 1);
+ condition = (number == RREF_ANY || number == Fcurrent_recurse);
+ }
+ break;
+
+ case OP_DNRREF: /* Duplicate named group recursion test */
+ if (Fcurrent_recurse != RECURSE_UNSET)
+ {
+ int count = GET2(Fecode, 1 + IMM2_SIZE);
+ PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
+ while (count-- > 0)
+ {
+ number = GET2(slot, 0);
+ condition = number == Fcurrent_recurse;
+ if (condition) break;
+ slot += mb->name_entry_size;
+ }
+ }
+ break;
+
+ case OP_CREF: /* Numbered group used test */
+ offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
+ condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
+ break;
+
+ case OP_DNCREF: /* Duplicate named group used test */
+ {
+ int count = GET2(Fecode, 1 + IMM2_SIZE);
+ PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
+ while (count-- > 0)
+ {
+ offset = (GET2(slot, 0) << 1) - 2;
+ condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
+ if (condition) break;
+ slot += mb->name_entry_size;
+ }
+ }
+ break;
+
+ case OP_FALSE:
+ case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
+ break;
+
+ case OP_TRUE:
+ condition = TRUE;
+ break;
+
+ /* The condition is an assertion. Run code similar to the assertion code
+ above. */
+
+#define Lpositive F->temp_32[0]
+#define Lstart_branch F->temp_sptr[0]
+
+ default:
+ Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
+ Lstart_branch = Fecode;
+
+ for (;;)
+ {
+ group_frame_type = GF_CONDASSERT | *Fecode;
+ RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
+
+ switch(rrc)
+ {
+ case MATCH_ACCEPT: /* Save captures */
+ memcpy(Fovector,
+ (char *)assert_accept_frame + offsetof(heapframe, ovector),
+ assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
+ Foffset_top = assert_accept_frame->offset_top;
+
+ /* Fall through */
+ /* In the case of a match, the captures have already been put into
+ the current frame. */
+
+ case MATCH_MATCH:
+ condition = Lpositive; /* TRUE for positive assertion */
+ break;
+
+ /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
+ assertion; it is therefore always treated as NOMATCH. */
+
+ case MATCH_NOMATCH:
+ case MATCH_THEN:
+ Lstart_branch += GET(Lstart_branch, 1);
+ if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
+ condition = !Lpositive; /* TRUE for negative assertion */
+ break;
+
+ /* These force no match without checking other branches. */
+
+ case MATCH_COMMIT:
+ case MATCH_SKIP:
+ case MATCH_PRUNE:
+ condition = !Lpositive;
+ break;
+
+ default:
+ RRETURN(rrc);
+ }
+ break; /* Out of the branch loop */
+ }
+
+ /* If the condition is true, find the end of the assertion so that
+ advancing past it gets us to the start of the first branch. */
+
+ if (condition)
+ {
+ do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
+ }
+ break; /* End of assertion condition */
+ }
+
+#undef Lpositive
+#undef Lstart_branch
+
+ /* Choose branch according to the condition. */
+
+ Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
+
+ /* If the opcode is OP_SCOND it means we are at a repeated conditional
+ group that might match an empty string. We must therefore descend a level
+ so that the start is remembered for checking. For OP_COND we can just
+ continue at this level. */
+
+ if (Fop == OP_SCOND)
+ {
+ group_frame_type = GF_NOCAPTURE | Fop;
+ RMATCH(Fecode, RM35);
+ RRETURN(rrc);
+ }
+ break;
+
+
+
+/* ========================================================================= */
+/* End of start of parenthesis opcodes */
+/* ========================================================================= */
+
+
+ /* ===================================================================== */
+ /* Move the subject pointer back. This occurs only at the start of each
+ branch of a lookbehind assertion. If we are too close to the start to move
+ back, fail. When working with UTF-8 we move back a number of characters,
+ not bytes. */
+
+ case OP_REVERSE:
+ number = GET(Fecode, 1);
+#ifdef SUPPORT_UNICODE
+ if (utf)
+ {
+ while (number-- > 0)
+ {
+ if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH);
+ Feptr--;
+ BACKCHAR(Feptr);
+ }
+ }
+ else
+#endif
+
+ /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
+
+ {
+ if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
+ Feptr -= number;
+ }
+
+ /* Save the earliest consulted character, then skip to next opcode */
+
+ if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
+ Fecode += 1 + LINK_SIZE;
+ break;
+
+
+ /* ===================================================================== */
+ /* An alternation is the end of a branch; scan along to find the end of the
+ bracketed group. */
+
+ case OP_ALT:
+ do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
+ break;
+
+
+ /* ===================================================================== */
+ /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
+ starting frame was added to the chained frames in order to remember the
+ starting subject position for the group. */
+
+ case OP_KET:
+ case OP_KETRMIN:
+ case OP_KETRMAX:
+ case OP_KETRPOS:
+
+ bracode = Fecode - GET(Fecode, 1);
+
+ /* Point N to the frame at the start of the most recent group.
+ Remember the subject pointer at the start of the group. */
+
+ if (*bracode != OP_BRA && *bracode != OP_COND)
+ {
+ N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
+ P = (heapframe *)((char *)N - frame_size);
+ Flast_group_offset = P->last_group_offset;
+
+#ifdef DEBUG_SHOW_RMATCH
+ fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
+ N->rdepth, N->group_frame_type,
+ (char *)P->eptr - (char *)mb->start_subject);
#endif
+
+ /* If we are at the end of an assertion that is a condition, return a
+ match, discarding any intermediate backtracking points. Copy back the
+ captures into the frame before N so that they are set on return. Doing
+ this for all assertions, both positive and negative, seems to match what
+ Perl does. */
+
+ if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
+ {
+ memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
+ Foffset_top * sizeof(PCRE2_SIZE));
+ P->offset_top = Foffset_top;
+ Fback_frame = (char *)F - (char *)P;
+ RRETURN(MATCH_MATCH);
+ }
+ }
+ else P = NULL; /* Indicates starting frame not recorded */
+
+ /* The group was not a conditional assertion. */
+
+ switch (*bracode)
+ {
+ case OP_BRA: /* No need to do anything for these */
+ case OP_COND:
+ case OP_SCOND:
+ break;
+
+ /* Positive assertions are like OP_ONCE, except that in addition the
+ subject pointer must be put back to where it was at the start of the
+ assertion. */
+
+ case OP_ASSERT:
+ case OP_ASSERTBACK:
+ if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
+ Feptr = P->eptr;
+ /* Fall through */
+
+ /* For an atomic group, discard internal backtracking points. We must
+ also ensure that any remaining branches within the top-level of the group
+ are not tried. Do this by adjusting the code pointer within the backtrack
+ frame so that it points to the final branch. */
+
+ case OP_ONCE:
+ Fback_frame = ((char *)F - (char *)P);
+ for (;;)
+ {
+ uint32_t y = GET(P->ecode,1);
+ if ((P->ecode)[y] != OP_ALT) break;
+ P->ecode += y;
+ }
+ break;
+
+ /* A matching negative assertion returns MATCH, which is turned into
+ NOMATCH at the assertion level. */
+
+ case OP_ASSERT_NOT:
+ case OP_ASSERTBACK_NOT:
+ RRETURN(MATCH_MATCH);
+
+ /* Whole-pattern recursion is coded as a recurse into group 0, so it
+ won't be picked up here. Instead, we catch it when the OP_END is reached.
+ Other recursion is handled here. */
+
+ case OP_CBRA:
+ case OP_CBRAPOS:
+ case OP_SCBRA:
+ case OP_SCBRAPOS:
+ number = GET2(bracode, 1+LINK_SIZE);
+
+ /* Handle a recursively called group. We reinstate the previous set of
+ captures and then carry on after the recursion call. */
+
+ if (Fcurrent_recurse == number)
+ {
+ P = (heapframe *)((char *)N - frame_size);
+ memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
+ P->offset_top * sizeof(PCRE2_SIZE));
+ Foffset_top = P->offset_top;
+ Fcapture_last = P->capture_last;
+ Fcurrent_recurse = P->current_recurse;
+ Fecode = P->ecode + 1 + LINK_SIZE;
+ continue; /* With next opcode */
+ }
+
+ /* Deal with actual capturing. */
+
+ offset = (number << 1) - 2;
+ Fcapture_last = number;
+ Fovector[offset] = P->eptr - mb->start_subject;
+ Fovector[offset+1] = Feptr - mb->start_subject;
+ if (offset >= Foffset_top) Foffset_top = offset + 2;
+ break;
+ } /* End actions relating to the starting opcode */
+
+ /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
+ and return the MATCH_KETRPOS. This makes it possible to do the repeats one
+ at a time from the outer level. This must precede the empty string test -
+ in this case that test is done at the outer level. */
+
+ if (*Fecode == OP_KETRPOS)
+ {
+ memcpy((char *)P + offsetof(heapframe, eptr),
+ (char *)F + offsetof(heapframe, eptr),
+ frame_copy_size);
+ RRETURN(MATCH_KETRPOS);
+ }
+
+ /* Handle the different kinds of closing brackets. A non-repeating ket
+ needs no special action, just continuing at this level. This also happens
+ for the repeating kets if the group matched no characters, in order to
+ forcibly break infinite loops. Otherwise, the repeating kets try the rest
+ of the pattern or restart from the preceding bracket, in the appropriate
+ order. */
+
+ if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
+ {
+ if (Fop == OP_KETRMIN)
+ {
+ RMATCH(Fecode + 1 + LINK_SIZE, RM6);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ Fecode -= GET(Fecode, 1);
+ break; /* End of ket processing */
+ }
+
+ /* Repeat the maximum number of times (KETRMAX) */
+
+ RMATCH(bracode, RM7);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ }
+
+ /* Carry on at this level for a non-repeating ket, or after matching an
+ empty string, or after repeating for a maximum number of times. */
+
+ Fecode += 1 + LINK_SIZE;
+ break;
+
+
+ /* ===================================================================== */
+ /* Start and end of line assertions, not multiline mode. */
+
+ case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
+ if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
+
+ case OP_SOD: /* Unconditional start of subject */
+ if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
+
+ /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
+ terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
+
+ case OP_DOLL:
+ if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
+ if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
+
+ /* Fall through */
+ /* Unconditional end of subject assertion (\z) */
+
+ case OP_EOD:
+ if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
+ Fecode++;
+ break;
+
+ /* End of subject or ending \n assertion (\Z) */
+
+ case OP_EODN:
+ ASSERT_NL_OR_EOS:
+ if (Feptr < mb->end_subject &&
+ (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
+ {
+ if (mb->partial != 0 &&
+ Feptr + 1 >= mb->end_subject &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
+ {
+ mb->hitend = TRUE;
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+
+ /* Either at end of string or \n before end. */
+
+ SCHECK_PARTIAL();
+ Fecode++;
+ break;
+
+
+ /* ===================================================================== */
+ /* Start and end of line assertions, multiline mode. */
+
+ /* Start of subject unless notbol, or after any newline except for one at
+ the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
+
+ case OP_CIRCM:
+ if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
+ RRETURN(MATCH_NOMATCH);
+ if (Feptr != mb->start_subject &&
+ ((Feptr == mb->end_subject &&
+ (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
+ !WAS_NEWLINE(Feptr)))
+ RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
+
+ /* Assert before any newline, or before end of subject unless noteol is
+ set. */
+
+ case OP_DOLLM:
+ if (Feptr < mb->end_subject)
+ {
+ if (!IS_NEWLINE(Feptr))
+ {
+ if (mb->partial != 0 &&
+ Feptr + 1 >= mb->end_subject &&
+ NLBLOCK->nltype == NLTYPE_FIXED &&
+ NLBLOCK->nllen == 2 &&
+ UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
+ {
+ mb->hitend = TRUE;
+ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ }
+ else
+ {
+ if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
+ SCHECK_PARTIAL();
+ }
+ Fecode++;
+ break;
+
+
+ /* ===================================================================== */
+ /* Start of match assertion */
+
+ case OP_SOM:
+ if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
+ Fecode++;
+ break;
+
+
+ /* ===================================================================== */
+ /* Reset the start of match point */
+
+ case OP_SET_SOM:
+ Fstart_match = Feptr;
+ Fecode++;
+ break;
+
+
+ /* ===================================================================== */
+ /* Word boundary assertions. Find out if the previous and current
+ characters are "word" characters. It takes a bit more work in UTF mode.
+ Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
+ not set. When it is set, use Unicode properties if available, even when not
+ in UTF mode. Remember the earliest and latest consulted characters. */
+
+ case OP_NOT_WORD_BOUNDARY:
+ case OP_WORD_BOUNDARY:
+ if (Feptr == mb->start_subject) prev_is_word = FALSE; else
+ {
+ PCRE2_SPTR lastptr = Feptr - 1;
#ifdef SUPPORT_UNICODE
- LBL(16) LBL(18)
- LBL(22) LBL(23) LBL(28) LBL(30)
- LBL(32) LBL(34) LBL(42) LBL(46)
- LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
- LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
+ if (utf)
+ {
+ BACKCHAR(lastptr);
+ GETCHAR(fc, lastptr);
+ }
+ else
#endif /* SUPPORT_UNICODE */
- default:
- return PCRE2_ERROR_INTERNAL;
- }
-#undef LBL
-#endif /* HEAP_MATCH_RECURSE */
-}
+ fc = *lastptr;
+ if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
+#ifdef SUPPORT_UNICODE
+ if ((mb->poptions & PCRE2_UCP) != 0)
+ {
+ if (fc == '_') prev_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(fc);
+ prev_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif /* SUPPORT_UNICODE */
+ prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
+ }
+
+ /* Get status of next character */
+ if (Feptr >= mb->end_subject)
+ {
+ SCHECK_PARTIAL();
+ cur_is_word = FALSE;
+ }
+ else
+ {
+ PCRE2_SPTR nextptr = Feptr + 1;
+#ifdef SUPPORT_UNICODE
+ if (utf)
+ {
+ FORWARDCHARTEST(nextptr, mb->end_subject);
+ GETCHAR(fc, Feptr);
+ }
+ else
+#endif /* SUPPORT_UNICODE */
+ fc = *Feptr;
+ if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
+#ifdef SUPPORT_UNICODE
+ if ((mb->poptions & PCRE2_UCP) != 0)
+ {
+ if (fc == '_') cur_is_word = TRUE; else
+ {
+ int cat = UCD_CATEGORY(fc);
+ cur_is_word = (cat == ucp_L || cat == ucp_N);
+ }
+ }
+ else
+#endif /* SUPPORT_UNICODE */
+ cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
+ }
-/***************************************************************************
-****************************************************************************
- RECURSION IN THE match() FUNCTION
+ /* Now see if the situation is what we want */
-Undefine all the macros that were defined above to handle this. */
+ if ((*Fecode++ == OP_WORD_BOUNDARY)?
+ cur_is_word == prev_is_word : cur_is_word != prev_is_word)
+ RRETURN(MATCH_NOMATCH);
+ break;
-#ifdef HEAP_MATCH_RECURSE
-#undef eptr
-#undef ecode
-#undef mstart
-#undef offset_top
-#undef eptrb
-#undef flags
-#undef callpat
-#undef charptr
-#undef data
-#undef next_ecode
-#undef pp
-#undef prev
-#undef saved_eptr
+ /* ===================================================================== */
+ /* Backtracking (*VERB)s, with and without arguments. Note that if the
+ pattern is successfully matched, we do not come back from RMATCH. */
-#undef new_recursive
+ case OP_MARK:
+ Fmark = mb->nomatch_mark = Fecode + 2;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
-#undef cur_is_word
-#undef condition
-#undef prev_is_word
+ /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
+ argument, and we must check whether that argument matches this MARK's
+ argument. It is passed back in mb->verb_skip_ptr. If it does match, we
+ return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
+ position that corresponds to this mark. Otherwise, pass back the return
+ code unaltered. */
-#undef ctype
-#undef length
-#undef max
-#undef min
-#undef number
-#undef offset
-#undef op
-#undef save_capture_last
-#undef save_offset1
-#undef save_offset2
-#undef save_offset3
+ if (rrc == MATCH_SKIP_ARG &&
+ PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
+ {
+ mb->verb_skip_ptr = Feptr; /* Pass back current position */
+ RRETURN(MATCH_SKIP);
+ }
+ RRETURN(rrc);
-#undef newptrb
-#endif /* HEAP_MATCH_RECURSE */
+ case OP_FAIL:
+ RRETURN(MATCH_NOMATCH);
-/* These two are defined as macros in both cases */
+ /* Record the current recursing group number in mb->verb_current_recurse
+ when a backtracking return such as MATCH_COMMIT is given. This enables the
+ recurse processing to catch verbs from within the recursion. */
-#undef fc
-#undef fi
+ case OP_COMMIT:
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_COMMIT);
-/***************************************************************************
-***************************************************************************/
+ case OP_COMMIT_ARG:
+ Fmark = mb->nomatch_mark = Fecode + 2;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_COMMIT);
+ case OP_PRUNE:
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_PRUNE);
-#ifdef HEAP_MATCH_RECURSE
-/*************************************************
-* Release allocated heap frames *
-*************************************************/
+ case OP_PRUNE_ARG:
+ Fmark = mb->nomatch_mark = Fecode + 2;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_PRUNE);
-/* This function releases all the allocated frames. The base frame is on the
-machine stack, and so must not be freed.
+ case OP_SKIP:
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_skip_ptr = Feptr; /* Pass back current position */
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_SKIP);
-Argument:
- frame_base the address of the base frame
- mb the match block
+ /* Note that, for Perl compatibility, SKIP with an argument does NOT set
+ nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
+ not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
+ that failed and any that precede it (either they also failed, or were not
+ triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
+ SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
+ set to the count of the one that failed. */
-Returns: nothing
-*/
+ case OP_SKIP_ARG:
+ mb->skip_arg_count++;
+ if (mb->skip_arg_count <= mb->ignore_skip_arg)
+ {
+ Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
+ break;
+ }
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-static void
-release_match_heapframes (heapframe *frame_base, match_block *mb)
-{
-heapframe *nextframe = frame_base->Xnextframe;
-while (nextframe != NULL)
+ /* Pass back the current skip name and return the special MATCH_SKIP_ARG
+ return code. This will either be caught by a matching MARK, or get to the
+ top, where it causes a rematch with mb->ignore_skip_arg set to the value of
+ mb->skip_arg_count. */
+
+ mb->verb_skip_ptr = Fecode + 2;
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_SKIP_ARG);
+
+ /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
+ the branch in which it occurs can be determined. */
+
+ case OP_THEN:
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_ecode_ptr = Fecode;
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_THEN);
+
+ case OP_THEN_ARG:
+ Fmark = mb->nomatch_mark = Fecode + 2;
+ RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_ecode_ptr = Fecode;
+ mb->verb_current_recurse = Fcurrent_recurse;
+ RRETURN(MATCH_THEN);
+
+
+ /* ===================================================================== */
+ /* There's been some horrible disaster. Arrival here can only mean there is
+ something seriously wrong in the code above or the OP_xxx definitions. */
+
+ default:
+ return PCRE2_ERROR_INTERNAL;
+ }
+
+ /* Do not insert any code in here without much thought; it is assumed
+ that "continue" in the code above comes out to here to repeat the main
+ loop. */
+
+ } /* End of main loop */
+/* Control never reaches here */
+
+
+/* ========================================================================= */
+/* The RRETURN() macro jumps here. The number that is saved in Freturn_id
+indicates which label we actually want to return to. The value in Frdepth is
+the index number of the frame in the vector. The return value has been placed
+in rrc. */
+
+#define LBL(val) case val: goto L_RM##val;
+
+RETURN_SWITCH:
+if (Frdepth == 0) return rrc; /* Exit from the top level */
+F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
+mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
+
+#ifdef DEBUG_SHOW_RMATCH
+fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
+#endif
+
+switch (Freturn_id)
{
- heapframe *oldframe = nextframe;
- nextframe = nextframe->Xnextframe;
- mb->stack_memctl.free(oldframe, mb->stack_memctl.memory_data);
+ LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
+ LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
+ LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
+ LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
+ LBL(33) LBL(34) LBL(35) LBL(36)
+
+#ifdef SUPPORT_WIDE_CHARS
+ LBL(100) LBL(101)
+#endif
+
+#ifdef SUPPORT_UNICODE
+ LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
+ LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
+ LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
+ LBL(221) LBL(222)
+#endif
+
+ default:
+ return PCRE2_ERROR_INTERNAL;
}
+#undef LBL
}
-#endif /* HEAP_MATCH_RECURSE */
-
/*************************************************
@@ -6444,8 +6000,6 @@ pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
pcre2_match_context *mcontext)
{
int rc;
-int ocount;
-
const uint8_t *start_bits = NULL;
const pcre2_real_code *re = (const pcre2_real_code *)code;
@@ -6455,7 +6009,6 @@ BOOL firstline;
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
BOOL startline;
-BOOL using_temporary_offsets = FALSE;
BOOL utf;
PCRE2_UCHAR first_cu = 0;
@@ -6470,18 +6023,22 @@ PCRE2_SPTR req_cu_ptr = start_match - 1;
PCRE2_SPTR start_partial = NULL;
PCRE2_SPTR match_partial = NULL;
-/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
-is used below, and it expects NLBLOCK to be defined as a pointer. */
+PCRE2_SIZE frame_size;
+
+/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
+macro is used below, and it expects NLBLOCK to be defined as a pointer. */
+pcre2_callout_block cb;
match_block actual_match_block;
match_block *mb = &actual_match_block;
-#ifdef HEAP_MATCH_RECURSE
-heapframe frame_zero;
-frame_zero.Xprevframe = NULL; /* Marks the top level */
-frame_zero.Xnextframe = NULL; /* None are allocated yet */
-mb->match_frames_base = &frame_zero;
-#endif
+/* Allocate an initial vector of backtracking frames on the stack. If this
+proves to be too small, it is replaced by a larger one on the heap. To get a
+vector of the size required that is aligned for pointers, allocate it as a
+vector of pointers. */
+
+PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)];
+mb->stack_frames = (heapframe *)stack_frames_vector;
/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
subject string. */
@@ -6510,8 +6067,8 @@ options variable for this function. Users of PCRE2 who are not calling the
function directly would like to have a way of setting these flags, in the same
way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
-(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
-transferred to the options for this function. The bits are guaranteed to be
+(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now
+transfer to the options for this function. The bits are guaranteed to be
adjacent, but do not have the same values. This bit of Boolean trickery assumes
that the match-time bits are not more significant than the flag bits. If by
accident this is not the case, a compile-time division by zero error will
@@ -6523,20 +6080,22 @@ options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
#undef FF
#undef OO
-/* A NULL match context means "use a default context" */
-
-if (mcontext == NULL)
- mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
-
/* These two settings are used in the code for checking a UTF string that
follows immediately afterwards. Other values in the mb block are used only
-during interpretive pcre_match() processing, not when the JIT support is in
-use, so they are set up later. */
+during interpretive processing, not when the JIT support is in use, so they are
+set up later. */
utf = (re->overall_options & PCRE2_UTF) != 0;
mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
+/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
+time. */
+
+if (mb->partial != 0 &&
+ ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
+ return PCRE2_ERROR_BADOPTION;
+
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
multiunit character. We check only the portion of the subject that is going to
@@ -6595,7 +6154,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
/* It is an error to set an offset limit without setting the flag at compile
time. */
-if (mcontext->offset_limit != PCRE2_UNSET &&
+if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
(re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
return PCRE2_ERROR_BADOFFSETLIMIT;
@@ -6614,7 +6173,15 @@ if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)
}
#endif
-/* Carry on with non-JIT matching. */
+/* Carry on with non-JIT matching. A NULL match context means "use a default
+context", but we take the memory control functions from the pattern. */
+
+if (mcontext == NULL)
+ {
+ mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
+ mb->memctl = re->memctl;
+ }
+else mb->memctl = mcontext->memctl;
anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
@@ -6622,14 +6189,19 @@ startline = (re->flags & PCRE2_STARTLINE) != 0;
bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
end_subject : subject + mcontext->offset_limit;
-/* Fill in the fields in the match block. */
+/* Initialize and set up the fixed fields in the callout block, with a pointer
+in the match block. */
+
+mb->cb = &cb;
+cb.version = 2;
+cb.subject = subject;
+cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
+cb.callout_flags = 0;
+
+/* Fill in the remaining fields in the match block. */
mb->callout = mcontext->callout;
mb->callout_data = mcontext->callout_data;
-mb->memctl = mcontext->memctl;
-#ifdef HEAP_MATCH_RECURSE
-mb->stack_memctl = mcontext->stack_memctl;
-#endif
mb->start_subject = subject;
mb->start_offset = start_offset;
@@ -6641,8 +6213,6 @@ mb->poptions = re->overall_options; /* Pattern options */
mb->ignore_skip_arg = 0;
mb->mark = mb->nomatch_mark = NULL; /* In case never set */
-mb->recursive = NULL; /* No recursion at top level */
-mb->ovecsave_chain = NULL; /* No ovecsave blocks yet */
mb->hitend = FALSE;
/* The name table is needed for finding all the numbers associated with a
@@ -6653,20 +6223,6 @@ mb->name_count = re->name_count;
mb->name_entry_size = re->name_entry_size;
mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
-/* Limits set in the pattern override the match context only if they are
-smaller. */
-
-mb->match_limit = (mcontext->match_limit < re->limit_match)?
- mcontext->match_limit : re->limit_match;
-mb->match_limit_recursion = (mcontext->recursion_limit < re->limit_recursion)?
- mcontext->recursion_limit : re->limit_recursion;
-
-/* Pointers to the individual character tables */
-
-mb->lcc = re->tables + lcc_offset;
-mb->fcc = re->tables + fcc_offset;
-mb->ctypes = re->tables + ctypes_offset;
-
/* Process the \R and newline settings. */
mb->bsr_convention = re->bsr_convention;
@@ -6683,6 +6239,11 @@ switch(re->newline_convention)
mb->nl[0] = CHAR_NL;
break;
+ case PCRE2_NEWLINE_NUL:
+ mb->nllen = 1;
+ mb->nl[0] = CHAR_NUL;
+ break;
+
case PCRE2_NEWLINE_CRLF:
mb->nllen = 2;
mb->nl[0] = CHAR_CR;
@@ -6700,71 +6261,91 @@ switch(re->newline_convention)
default: return PCRE2_ERROR_INTERNAL;
}
-/* If the expression has got more back references than the offsets supplied can
-hold, we get a temporary chunk of memory to use during the matching. Otherwise,
-we can use the vector supplied. The size of the ovector is three times the
-value in the oveccount field. Two-thirds of it is pairs for storing matching
-offsets, and the top third is working space. */
+/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
+vector at the end, whose size depends on the number of capturing parentheses in
+the pattern. It is not used at all if there are no capturing parentheses.
+
+ frame_size is the total size of each frame
+ mb->frame_vector_size is the total usable size of the vector (rounded down
+ to a whole number of frames)
-if (re->top_backref >= match_data->oveccount)
+The last of these is changed within the match() function if the frame vector
+has to be expanded. We therefore put it into the match block so that it is
+correct when calling match() more than once for non-anchored patterns. */
+
+frame_size = offsetof(heapframe, ovector) +
+ re->top_bracket * 2 * sizeof(PCRE2_SIZE);
+
+/* Limits set in the pattern override the match context only if they are
+smaller. */
+
+mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
+ mcontext->heap_limit : re->limit_heap;
+
+mb->match_limit = (mcontext->match_limit < re->limit_match)?
+ mcontext->match_limit : re->limit_match;
+
+mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
+ mcontext->depth_limit : re->limit_depth;
+
+/* If a pattern has very many capturing parentheses, the frame size may be very
+large. Ensure that there are at least 10 available frames by getting an initial
+vector on the heap if necessary. If the heap limit (which is in kibibytes)
+prevents this, get as many whole frames as the limit allows, or give a heap
+limit error if not even one frame fits within the limit. */
+
+if (frame_size <= START_FRAMES_SIZE/10)
{
- ocount = re->top_backref * 3 + 3;
- mb->ovector = (PCRE2_SIZE *)(mb->memctl.malloc(ocount * sizeof(PCRE2_SIZE),
- mb->memctl.memory_data));
- if (mb->ovector == NULL) return PCRE2_ERROR_NOMEMORY;
- using_temporary_offsets = TRUE;
+ mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
+ mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
}
else
{
- ocount = 3 * match_data->oveccount;
- mb->ovector = match_data->ovector;
+ mb->frame_vector_size = frame_size * 10;
+ if ((mb->frame_vector_size / 1024) > mb->heap_limit)
+ {
+ if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
+ mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
+ }
+ mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
+ mb->memctl.memory_data);
+ if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
}
-mb->offset_end = ocount;
-mb->offset_max = (2*ocount)/3;
+mb->match_frames_top =
+ (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
-/* Reset the working variable associated with each extraction. These should
-never be used unless previously set, but they get saved and restored, and so we
-initialize them to avoid reading uninitialized locations. Also, unset the
-offsets for the matched string. This is really just for tidiness with callouts,
-in case they inspect these fields. */
+/* Write to the ovector within the first frame to mark every capture unset and
+to avoid uninitialized memory read errors when it is copied to a new frame. */
-if (ocount > 0)
- {
- register PCRE2_SIZE *iptr = mb->ovector + ocount;
- register PCRE2_SIZE *iend = iptr - re->top_bracket;
- if (iend < mb->ovector + 2) iend = mb->ovector + 2;
- while (--iptr >= iend) *iptr = PCRE2_UNSET;
- mb->ovector[0] = mb->ovector[1] = PCRE2_UNSET;
- }
+memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
+ re->top_bracket * 2 * sizeof(PCRE2_SIZE));
-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Pointers to the individual character tables */
+
+mb->lcc = re->tables + lcc_offset;
+mb->fcc = re->tables + fcc_offset;
+mb->ctypes = re->tables + ctypes_offset;
+
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */
-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
{
- if ((re->flags & PCRE2_FIRSTSET) != 0)
+ has_first_cu = TRUE;
+ first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+ if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
- has_first_cu = TRUE;
- first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
- if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
- {
- first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
+ first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+ if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
- }
}
- else
- if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
- start_bits = re->start_bitmap;
}
+else
+ if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+ start_bits = re->start_bitmap;
-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may also be a "last known required character" set. */
if ((re->flags & PCRE2_LASTSET) != 0)
{
@@ -6788,7 +6369,6 @@ the loop runs just once. */
for(;;)
{
PCRE2_SPTR new_start_match;
- mb->capture_last = 0;
/* ----------------- Start of match optimizations ---------------- */
@@ -6799,13 +6379,11 @@ for(;;)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
{
- PCRE2_SPTR save_end_subject = end_subject;
-
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans at a newline. If the match fails at the
- newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the scans for a first code unit at a newline.
+ If the match fails at the newline, later code breaks the loop. */
if (firstline)
{
@@ -6813,102 +6391,179 @@ for(;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
- ACROSSCHAR(t < end_subject, *t, t++);
+ ACROSSCHAR(t < end_subject, t, t++);
}
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
- /* Advance to a unique first code unit if there is one. In 8-bit mode, the
- use of memchr() gives a big speed up. */
+ /* Anchored: check the first code unit if one is recorded. This may seem
+ pointless but it can help in detecting a no match case without scanning for
+ the required code unit. */
- if (has_first_cu)
+ if (anchored)
{
- PCRE2_UCHAR smc;
- if (first_cu != first_cu2)
- while (start_match < end_subject &&
- (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
- start_match++;
- else
+ if (has_first_cu || start_bits != NULL)
{
+ BOOL ok = start_match < end_subject;
+ if (ok)
+ {
+ PCRE2_UCHAR c = UCHAR21TEST(start_match);
+ ok = has_first_cu && (c == first_cu || c == first_cu2);
+ if (!ok && start_bits != NULL)
+ {
#if PCRE2_CODE_UNIT_WIDTH != 8
- while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
- start_match++;
-#else
- start_match = memchr(start_match, first_cu, end_subject - start_match);
- if (start_match == NULL) start_match = end_subject;
+ if (c > 255) c = 255;
#endif
+ ok = (start_bits[c/8] & (1 << (c&7))) != 0;
+ }
+ }
+ if (!ok)
+ {
+ rc = MATCH_NOMATCH;
+ break;
+ }
}
}
- /* Or to just after a linebreak for a multiline match */
+ /* Not anchored. Advance to a unique first code unit if there is one. In
+ 8-bit mode, the use of memchr() gives a big speed up, even though we have
+ to call it twice in caseless mode, in order to find the earliest occurrence
+ of the character in either of its cases. */
- else if (startline)
+ else
{
- if (start_match > mb->start_subject + start_offset)
+ if (has_first_cu)
{
-#ifdef SUPPORT_UNICODE
- if (utf)
+ if (first_cu != first_cu2) /* Caseless */
{
- while (start_match < end_subject && !WAS_NEWLINE(start_match))
- {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+ PCRE2_UCHAR smc;
+ while (start_match < end_subject &&
+ (smc = UCHAR21TEST(start_match)) != first_cu &&
+ smc != first_cu2)
start_match++;
- ACROSSCHAR(start_match < end_subject, *start_match,
- start_match++);
- }
+#else /* 8-bit code units */
+ PCRE2_SPTR pp1 =
+ memchr(start_match, first_cu, end_subject-start_match);
+ PCRE2_SPTR pp2 =
+ memchr(start_match, first_cu2, end_subject-start_match);
+ if (pp1 == NULL)
+ start_match = (pp2 == NULL)? end_subject : pp2;
+ else
+ start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
+#endif
}
+
+ /* The caseful case */
+
else
+ {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+ while (start_match < end_subject && UCHAR21TEST(start_match) !=
+ first_cu)
+ start_match++;
+#else
+ start_match = memchr(start_match, first_cu, end_subject - start_match);
+ if (start_match == NULL) start_match = end_subject;
#endif
- while (start_match < end_subject && !WAS_NEWLINE(start_match))
- start_match++;
+ }
- /* If we have just passed a CR and the newline option is ANY or
- ANYCRLF, and we are now at a LF, advance the match position by one more
- code unit. */
+ /* If we can't find the first code unit, having reached the true end
+ of the subject, break the bumpalong loop, to force a match failure,
+ except when doing partial matching, when we let the next cycle run at
+ the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+ which partially matches "abc", even though the string does not contain
+ the starting character "d". If we have not reached the true end of the
+ subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+ we also let the cycle run, because the matching string is legitimately
+ allowed to start with the first code unit of a newline. */
+
+ if (!mb->partial && start_match >= mb->end_subject)
+ {
+ rc = MATCH_NOMATCH;
+ break;
+ }
+ }
- if (start_match[-1] == CHAR_CR &&
- (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
- start_match < end_subject &&
- UCHAR21TEST(start_match) == CHAR_NL)
- start_match++;
+ /* If there's no first code unit, advance to just after a linebreak for a
+ multiline match if required. */
+
+ else if (startline)
+ {
+ if (start_match > mb->start_subject + start_offset)
+ {
+#ifdef SUPPORT_UNICODE
+ if (utf)
+ {
+ while (start_match < end_subject && !WAS_NEWLINE(start_match))
+ {
+ start_match++;
+ ACROSSCHAR(start_match < end_subject, start_match, start_match++);
+ }
+ }
+ else
+#endif
+ while (start_match < end_subject && !WAS_NEWLINE(start_match))
+ start_match++;
+
+ /* If we have just passed a CR and the newline option is ANY or
+ ANYCRLF, and we are now at a LF, advance the match position by one
+ more code unit. */
+
+ if (start_match[-1] == CHAR_CR &&
+ (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+ start_match < end_subject &&
+ UCHAR21TEST(start_match) == CHAR_NL)
+ start_match++;
+ }
}
- }
- /* Or to a non-unique first code unit if any have been identified. The
- bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
- code units greater than 254 set the 255 bit. */
+ /* If there's no first code unit or a requirement for a multiline line
+ start, advance to a non-unique first code unit if any have been
+ identified. The bitmap contains only 256 bits. When code units are 16 or
+ 32 bits wide, all code units greater than 254 set the 255 bit. */
- else if (start_bits != NULL)
- {
- while (start_match < end_subject)
+ else if (start_bits != NULL)
{
- register uint32_t c = UCHAR21TEST(start_match);
+ while (start_match < end_subject)
+ {
+ uint32_t c = UCHAR21TEST(start_match);
#if PCRE2_CODE_UNIT_WIDTH != 8
- if (c > 255) c = 255;
+ if (c > 255) c = 255;
#endif
- if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
- start_match++;
+ if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+ start_match++;
+ }
+
+ /* See comment above in first_cu checking about the next few lines. */
+
+ if (!mb->partial && start_match >= mb->end_subject)
+ {
+ rc = MATCH_NOMATCH;
+ break;
+ }
}
- }
+ } /* End first code unit handling */
/* Restore fudged end_subject */
- end_subject = save_end_subject;
+ end_subject = mb->end_subject;
- /* The following two optimizations are disabled for partial matching. */
+ /* The following two optimizations must be disabled for partial matching. */
if (!mb->partial)
{
- /* The minimum matching length is a lower bound; no actual string of that
- length may actually match the pattern. Although the value is, strictly,
- in characters, we treat it as code units to avoid spending too much time
- in this optimization. */
+ /* The minimum matching length is a lower bound; no string of that length
+ may actually match the pattern. Although the value is, strictly, in
+ characters, we treat it as code units to avoid spending too much time in
+ this optimization. */
if (end_subject - start_match < re->minlength)
{
@@ -6917,12 +6572,16 @@ for(;;)
}
/* If req_cu is set, we know that that code unit must appear in the
- subject for the match to succeed. If the first code unit is set, req_cu
- must be later in the subject; otherwise the test starts at the match
- point. This optimization can save a huge amount of backtracking in
- patterns with nested unlimited repeats that aren't going to match.
- Writing separate code for cased/caseless versions makes it go faster, as
- does using an autoincrement and backing off on a match.
+ subject for the (non-partial) match to succeed. If the first code unit is
+ set, req_cu must be later in the subject; otherwise the test starts at
+ the match point. This optimization can save a huge amount of backtracking
+ in patterns with nested unlimited repeats that aren't going to match.
+ Writing separate code for caseful/caseless versions makes it go faster,
+ as does using an autoincrement and backing off on a match. As in the case
+ of the first code unit, using memchr() in the 8-bit library gives a big
+ speed up. Unlike the first_cu check above, we do not need to call
+ memchr() twice in the caseless case because we only need to check for the
+ presence of the character in either case, not find the first occurrence.
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
@@ -6932,30 +6591,55 @@ for(;;)
if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
{
- register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+ PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
/* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
+ place we found it last time round the bumpalong loop. */
if (p > req_cu_ptr)
{
- if (req_cu != req_cu2)
+ if (p < end_subject)
{
- while (p < end_subject)
+ if (req_cu != req_cu2) /* Caseless */
{
- register uint32_t pp = UCHAR21INCTEST(p);
- if (pp == req_cu || pp == req_cu2) { p--; break; }
+#if PCRE2_CODE_UNIT_WIDTH != 8
+ do
+ {
+ uint32_t pp = UCHAR21INCTEST(p);
+ if (pp == req_cu || pp == req_cu2) { p--; break; }
+ }
+ while (p < end_subject);
+
+#else /* 8-bit code units */
+ PCRE2_SPTR pp = p;
+ p = memchr(pp, req_cu, end_subject - pp);
+ if (p == NULL)
+ {
+ p = memchr(pp, req_cu2, end_subject - pp);
+ if (p == NULL) p = end_subject;
+ }
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
}
- }
- else
- {
- while (p < end_subject)
+
+ /* The caseful case */
+
+ else
{
- if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
+#if PCRE2_CODE_UNIT_WIDTH != 8
+ do
+ {
+ if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
+ }
+ while (p < end_subject);
+
+#else /* 8-bit code units */
+ p = memchr(p, req_cu, end_subject - p);
+ if (p == NULL) p = end_subject;
+#endif
}
}
- /* If we can't find the required code unit, break the matching loop,
+ /* If we can't find the required code unit, break the bumpalong loop,
forcing a match failure. */
if (p >= end_subject)
@@ -6965,8 +6649,8 @@ for(;;)
}
/* If we have found the required code unit, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this code unit yet. */
+ found it, so that we don't search again next time round the bumpalong
+ loop if the start hasn't yet passed this code unit. */
req_cu_ptr = p;
}
@@ -6987,14 +6671,17 @@ for(;;)
/* OK, we can now run the match. If "hitend" is set afterwards, remember the
first starting point for which a partial match was found. */
- mb->start_match_ptr = start_match;
+ cb.start_match = (PCRE2_SIZE)(start_match - subject);
+ cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
+
mb->start_used_ptr = start_match;
mb->last_used_ptr = start_match;
mb->match_call_count = 0;
- mb->match_function_type = 0;
mb->end_offset_top = 0;
mb->skip_arg_count = 0;
- rc = match(start_match, mb->start_code, start_match, 2, mb, NULL, 0);
+
+ rc = match(start_match, mb->start_code, match_data->ovector,
+ match_data->oveccount, re->top_bracket, frame_size, mb);
if (mb->hitend && start_partial == NULL)
{
@@ -7020,9 +6707,9 @@ for(;;)
greater than the match we have just done, treat it as NOMATCH. */
case MATCH_SKIP:
- if (mb->start_match_ptr > start_match)
+ if (mb->verb_skip_ptr > start_match)
{
- new_start_match = mb->start_match_ptr;
+ new_start_match = mb->verb_skip_ptr;
break;
}
/* Fall through */
@@ -7037,7 +6724,7 @@ for(;;)
new_start_match = start_match + 1;
#ifdef SUPPORT_UNICODE
if (utf)
- ACROSSCHAR(new_start_match < end_subject, *new_start_match,
+ ACROSSCHAR(new_start_match < end_subject, new_start_match,
new_start_match++);
#endif
break;
@@ -7096,11 +6783,11 @@ for(;;)
/* ==========================================================================*/
-/* When we reach here, one of the stopping conditions is true:
+/* When we reach here, one of the following stopping conditions is true:
(1) The match succeeded, either completely, or partially;
-(2) The pattern is anchored or the match was failed by (*COMMIT);
+(2) The pattern is anchored or the match was failed after (*COMMIT);
(3) We are past the end of the subject or the bumpalong limit;
@@ -7114,18 +6801,10 @@ for(;;)
ENDLOOP:
-#ifdef HEAP_MATCH_RECURSE
-release_match_heapframes(&frame_zero, mb);
-#endif
-
-/* Release any frames that were saved from recursions. */
+/* Release an enlarged frame vector that is on the heap. */
-while (mb->ovecsave_chain != NULL)
- {
- ovecsave_frame *this = mb->ovecsave_chain;
- mb->ovecsave_chain = this->next;
- mb->memctl.free(this, mb->memctl.memory_data);
- }
+if (mb->match_frames != mb->stack_frames)
+ mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
/* Fill in fields that are always returned in the match data. */
@@ -7134,68 +6813,14 @@ match_data->subject = subject;
match_data->mark = mb->mark;
match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
-/* Handle a fully successful match. */
+/* Handle a fully successful match. Set the return code to the number of
+captured strings, or 0 if there were too many to fit into the ovector, and then
+set the remaining returned values before returning. */
-if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
+if (rc == MATCH_MATCH)
{
- uint32_t arg_offset_max = 2 * match_data->oveccount;
-
- /* When the offset vector is big enough to deal with any backreferences,
- captured substring offsets will already be set up. In the case where we had
- to get some local memory to hold offsets for backreference processing, copy
- those that we can. In this case there need not be overflow if certain parts
- of the pattern were not used, even though there are more capturing
- parentheses than vector slots. */
-
- if (using_temporary_offsets)
- {
- if (arg_offset_max >= 4)
- {
- memcpy(match_data->ovector + 2, mb->ovector + 2,
- (arg_offset_max - 2) * sizeof(PCRE2_SIZE));
- }
- if (mb->end_offset_top > arg_offset_max) mb->capture_last |= OVFLBIT;
- mb->memctl.free(mb->ovector, mb->memctl.memory_data);
- }
-
- /* Set the return code to the number of captured strings, or 0 if there were
- too many to fit into the ovector. */
-
- match_data->rc = ((mb->capture_last & OVFLBIT) != 0)?
- 0 : mb->end_offset_top/2;
-
- /* If there is space in the offset vector, set any pairs that follow the
- highest-numbered captured string but are less than the number of capturing
- groups in the pattern (and are within the ovector) to PCRE2_UNSET. It is
- documented that this happens. In earlier versions, the whole set of potential
- capturing offsets was initialized each time round the loop, but this is
- handled differently now. "Gaps" are set to PCRE2_UNSET dynamically instead
- (this fixed a bug). Thus, it is only those at the end that need setting here.
- We can't just mark them all unset at the start of the whole thing because
- they may get set in one branch that is not the final matching branch. */
-
- if (mb->end_offset_top/2 <= re->top_bracket)
- {
- register PCRE2_SIZE *iptr, *iend;
- int resetcount = re->top_bracket + 1;
- if (resetcount > match_data->oveccount) resetcount = match_data->oveccount;
- iptr = match_data->ovector + mb->end_offset_top;
- iend = match_data->ovector + 2 * resetcount;
- while (iptr < iend) *iptr++ = PCRE2_UNSET;
- }
-
- /* If there is space, set up the whole thing as substring 0. The value of
- mb->start_match_ptr might be modified if \K was encountered on the success
- matching path. */
-
- if (match_data->oveccount < 1) rc = 0; else
- {
- match_data->ovector[0] = mb->start_match_ptr - mb->start_subject;
- match_data->ovector[1] = mb->end_match_ptr - mb->start_subject;
- }
-
- /* Set the remaining returned values */
-
+ match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
+ 0 : (int)mb->end_offset_top/2 + 1;
match_data->startchar = start_match - subject;
match_data->leftchar = mb->start_used_ptr - subject;
match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
@@ -7211,18 +6836,14 @@ match_data->mark = mb->nomatch_mark;
/* For anything other than nomatch or partial match, just return the code. */
-if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL)
- match_data->rc = rc;
+if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
-/* Else handle a partial match. */
+/* Handle a partial match. */
else if (match_partial != NULL)
{
- if (match_data->oveccount > 0)
- {
- match_data->ovector[0] = match_partial - subject;
- match_data->ovector[1] = end_subject - subject;
- }
+ match_data->ovector[0] = match_partial - subject;
+ match_data->ovector[1] = end_subject - subject;
match_data->startchar = match_partial - subject;
match_data->leftchar = start_partial - subject;
match_data->rightchar = end_subject - subject;
@@ -7233,10 +6854,6 @@ else if (match_partial != NULL)
else match_data->rc = PCRE2_ERROR_NOMATCH;
-/* Free any temporary offsets. */
-
-if (using_temporary_offsets)
- mb->memctl.free(mb->ovector, mb->memctl.memory_data);
return match_data->rc;
}