summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_internal.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_internal.h')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_internal.h111
1 files changed, 77 insertions, 34 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_internal.h b/src/3rdparty/pcre2/src/pcre2_internal.h
index d8fad1e93b..92dd3138d4 100644
--- a/src/3rdparty/pcre2/src/pcre2_internal.h
+++ b/src/3rdparty/pcre2/src/pcre2_internal.h
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2020 University of Cambridge
+ New API code Copyright (c) 2016-2022 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -220,18 +220,17 @@ not rely on this. */
#define COMPILE_ERROR_BASE 100
-/* The initial frames vector for remembering backtracking points in
-pcre2_match() is allocated on the system stack, of this size (bytes). The size
-must be a multiple of sizeof(PCRE2_SPTR) in all environments, so making it a
-multiple of 8 is best. Typical frame sizes are a few hundred bytes (it depends
-on the number of capturing parentheses) so 20KiB handles quite a few frames. A
-larger vector on the heap is obtained for patterns that need more frames. The
-maximum size of this can be limited. */
+/* The initial frames vector for remembering pcre2_match() backtracking points
+is allocated on the heap, of this size (bytes) or ten times the frame size if
+larger, unless the heap limit is smaller. Typical frame sizes are a few hundred
+bytes (it depends on the number of capturing parentheses) so 20KiB handles
+quite a few frames. A larger vector on the heap is obtained for matches that
+need more frames, subject to the heap limit. */
#define START_FRAMES_SIZE 20480
-/* Similarly, for DFA matching, an initial internal workspace vector is
-allocated on the stack. */
+/* For DFA matching, an initial internal workspace vector is allocated on the
+stack. The heap is used only if this turns out to be too small. */
#define DFA_START_RWS_SIZE 30720
@@ -954,6 +953,13 @@ a positive value. */
#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION="
#define STRING_MARK "MARK"
+#define STRING_bc "bc"
+#define STRING_bidiclass "bidiclass"
+#define STRING_sc "sc"
+#define STRING_script "script"
+#define STRING_scriptextensions "scriptextensions"
+#define STRING_scx "scx"
+
#else /* SUPPORT_UNICODE */
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
@@ -1248,26 +1254,39 @@ only. */
#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
#define STRING_MARK STR_M STR_A STR_R STR_K
+#define STRING_bc STR_b STR_c
+#define STRING_bidiclass STR_b STR_i STR_d STR_i STR_c STR_l STR_a STR_s STR_s
+#define STRING_sc STR_s STR_c
+#define STRING_script STR_s STR_c STR_r STR_i STR_p STR_t
+#define STRING_scriptextensions STR_s STR_c STR_r STR_i STR_p STR_t STR_e STR_x STR_t STR_e STR_n STR_s STR_i STR_o STR_n STR_s
+#define STRING_scx STR_s STR_c STR_x
+
+
#endif /* SUPPORT_UNICODE */
/* -------------------- End of character and string names -------------------*/
/* -------------------- Definitions for compiled patterns -------------------*/
-/* Codes for different types of Unicode property */
+/* Codes for different types of Unicode property. If these definitions are
+changed, the autopossessifying table in pcre2_auto_possess.c must be updated to
+match. */
#define PT_ANY 0 /* Any property - matches all chars */
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
-#define PT_SC 4 /* Script (e.g. Han) */
-#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
-#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
-#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
-#define PT_WORD 8 /* Word - L plus N plus underscore */
-#define PT_CLIST 9 /* Pseudo-property: match character list */
-#define PT_UCNC 10 /* Universal Character nameable character */
-#define PT_TABSIZE 11 /* Size of square table for autopossessify tests */
+#define PT_SC 4 /* Script only (e.g. Han) */
+#define PT_SCX 5 /* Script extensions (includes SC) */
+#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */
+#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */
+#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD 9 /* Word - L plus N plus underscore */
+#define PT_CLIST 10 /* Pseudo-property: match character list */
+#define PT_UCNC 11 /* Universal Character nameable character */
+#define PT_BIDICL 12 /* Specified bidi class */
+#define PT_BOOL 13 /* Boolean property */
+#define PT_TABSIZE 14 /* Size of square table for autopossessify tests */
/* The following special properties are used only in XCLASS items, when POSIX
classes are specified and PCRE2_UCP is set - in other words, for Unicode
@@ -1275,22 +1294,27 @@ handling of these classes. They are not available via the \p or \P escapes like
those in the above list, and so they do not take part in the autopossessifying
table. */
-#define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper */
-#define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */
-#define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */
+#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */
+#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */
+#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */
+
+/* This value is used when parsing \p and \P escapes to indicate that neither
+\p{script:...} nor \p{scx:...} has been encountered. */
+
+#define PT_NOTSCRIPT 255
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain characters with values greater than 255. */
-#define XCL_NOT 0x01 /* Flag: this is a negative class */
-#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
-#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
+#define XCL_NOT 0x01 /* Flag: this is a negative class */
+#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
+#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
-#define XCL_END 0 /* Marks end of individual items */
-#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
-#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
-#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
-#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
+#define XCL_END 0 /* Marks end of individual items */
+#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
+#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
+#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
+#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns 0
@@ -1797,8 +1821,8 @@ typedef struct {
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
uint8_t caseset; /* offset to multichar other cases or zero */
int32_t other_case; /* offset to other case, or zero if none */
- int16_t scriptx; /* script extension value */
- int16_t dummy; /* spare - to round to multiple of 4 bytes */
+ uint16_t scriptx_bidiclass; /* script extension (11 bit) and bidi class (5 bit) values */
+ uint16_t bprops; /* binary properties offset */
} ucd_record;
/* UCD access macros */
@@ -1815,13 +1839,30 @@ typedef struct {
#define GET_UCD(ch) REAL_GET_UCD(ch)
#endif
+#define UCD_SCRIPTX_MASK 0x3ff
+#define UCD_BIDICLASS_SHIFT 11
+#define UCD_BPROPS_MASK 0xfff
+
+#define UCD_SCRIPTX_PROP(prop) ((prop)->scriptx_bidiclass & UCD_SCRIPTX_MASK)
+#define UCD_BIDICLASS_PROP(prop) ((prop)->scriptx_bidiclass >> UCD_BIDICLASS_SHIFT)
+#define UCD_BPROPS_PROP(prop) ((prop)->bprops & UCD_BPROPS_MASK)
+
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop
#define UCD_CASESET(ch) GET_UCD(ch)->caseset
#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
-#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx
+#define UCD_SCRIPTX(ch) UCD_SCRIPTX_PROP(GET_UCD(ch))
+#define UCD_BPROPS(ch) UCD_BPROPS_PROP(GET_UCD(ch))
+#define UCD_BIDICLASS(ch) UCD_BIDICLASS_PROP(GET_UCD(ch))
+
+/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words
+that form a bitmap representing a list of scripts or boolean properties. These
+macros test or set a bit in the map by number. */
+
+#define MAPBIT(map,n) ((map)[(n)/32]&(1u<<((n)%32)))
+#define MAPSET(map,n) ((map)[(n)/32]|=(1u<<((n)%32)))
/* Header for serialized pcre2 codes. */
@@ -1878,6 +1919,7 @@ extern const uint8_t PRIV(utf8_table4)[];
#endif
#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_)
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
+#define _pcre2_ucd_boolprop_sets PCRE2_SUFFIX(_pcre2_ucd_boolprop_sets_)
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
#define _pcre2_ucd_script_sets PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
@@ -1901,9 +1943,10 @@ extern const pcre2_match_context PRIV(default_match_context);
extern const uint8_t PRIV(default_tables)[];
extern const uint32_t PRIV(hspace_list)[];
extern const uint32_t PRIV(vspace_list)[];
+extern const uint32_t PRIV(ucd_boolprop_sets)[];
extern const uint32_t PRIV(ucd_caseless_sets)[];
extern const uint32_t PRIV(ucd_digit_sets)[];
-extern const uint8_t PRIV(ucd_script_sets)[];
+extern const uint32_t PRIV(ucd_script_sets)[];
extern const ucd_record PRIV(ucd_records)[];
#if PCRE2_CODE_UNIT_WIDTH == 32
extern const ucd_record PRIV(dummy_ucd_record)[];