summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h')
-rw-r--r--src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h1133
1 files changed, 815 insertions, 318 deletions
diff --git a/src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h b/src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h
index aa029cce38..783a85f50e 100644
--- a/src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h
+++ b/src/3rdparty/pcre2/src/pcre2_jit_simd_inc.h
@@ -42,7 +42,8 @@ POSSIBILITY OF SUCH DAMAGE.
#if !(defined SUPPORT_VALGRIND)
#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
- || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X))
+ || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
+ || (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))
typedef enum {
vector_compare_match1,
@@ -50,18 +51,39 @@ typedef enum {
vector_compare_match2,
} vector_compare_type;
-static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
+/* The AVX2 code path is currently disabled. */
+/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */
return 15;
#elif PCRE2_CODE_UNIT_WIDTH == 16
+/* The AVX2 code path is currently disabled. */
+/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */
return 7;
#elif PCRE2_CODE_UNIT_WIDTH == 32
+/* The AVX2 code path is currently disabled. */
+/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 7 : 3; */
return 3;
#else
#error "Unsupported unit width"
#endif
}
+#else /* !SLJIT_CONFIG_X86 */
+static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
+{
+#if PCRE2_CODE_UNIT_WIDTH == 8
+return 15;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+return 7;
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+return 3;
+#else
+#error "Unsupported unit width"
+#endif
+}
+#endif /* SLJIT_CONFIG_X86 */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
@@ -86,49 +108,35 @@ static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
{
sljit_u32 value = chr;
#if PCRE2_CODE_UNIT_WIDTH == 8
-#define SSE2_COMPARE_TYPE_INDEX 0
+#define SIMD_COMPARE_TYPE_INDEX 0
return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
#elif PCRE2_CODE_UNIT_WIDTH == 16
-#define SSE2_COMPARE_TYPE_INDEX 1
+#define SIMD_COMPARE_TYPE_INDEX 1
return (sljit_s32)((value << 16) | value);
#elif PCRE2_CODE_UNIT_WIDTH == 32
-#define SSE2_COMPARE_TYPE_INDEX 2
+#define SIMD_COMPARE_TYPE_INDEX 2
return (sljit_s32)(value);
#else
#error "Unsupported unit width"
#endif
}
-static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset)
+static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
+ sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
-sljit_u8 instruction[5];
-
-SLJIT_ASSERT(dst_xmm_reg < 8);
-SLJIT_ASSERT(src_general_reg < 8);
-
-/* MOVDQA xmm1, xmm2/m128 */
-instruction[0] = ((sljit_u8)offset & 0xf) == 0 ? 0x66 : 0xf3;
-instruction[1] = 0x0f;
-instruction[2] = 0x6f;
+sljit_u8 instruction[4];
-if (offset == 0)
+if (reg_type == SLJIT_SIMD_REG_128)
{
- instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
- sljit_emit_op_custom(compiler, instruction, 4);
- return;
+ instruction[0] = 0x66;
+ instruction[1] = 0x0f;
+ }
+else
+ {
+ /* Two byte VEX prefix. */
+ instruction[0] = 0xc5;
+ instruction[1] = 0xfd;
}
-
-instruction[3] = 0x40 | (dst_xmm_reg << 3) | src_general_reg;
-instruction[4] = (sljit_u8)offset;
-sljit_emit_op_custom(compiler, instruction, 5);
-}
-
-static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
- int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
-{
-sljit_u8 instruction[4];
-instruction[0] = 0x66;
-instruction[1] = 0x0f;
SLJIT_ASSERT(step >= 0 && step <= 3);
@@ -139,8 +147,10 @@ if (compare_type != vector_compare_match2)
if (compare_type == vector_compare_match1i)
{
/* POR xmm1, xmm2/m128 */
- /* instruction[0] = 0x66; */
- /* instruction[1] = 0x0f; */
+ if (reg_type == SLJIT_SIMD_REG_256)
+ instruction[1] ^= (dst_ind << 3);
+
+ /* Prefix is filled. */
instruction[2] = 0xeb;
instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
sljit_emit_op_custom(compiler, instruction, 4);
@@ -152,20 +162,35 @@ if (compare_type != vector_compare_match2)
return;
/* PCMPEQB/W/D xmm1, xmm2/m128 */
- /* instruction[0] = 0x66; */
- /* instruction[1] = 0x0f; */
- instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
+ if (reg_type == SLJIT_SIMD_REG_256)
+ instruction[1] ^= (dst_ind << 3);
+
+ /* Prefix is filled. */
+ instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;
}
+if (reg_type == SLJIT_SIMD_REG_256)
+ {
+ if (step == 2)
+ return;
+
+ if (step == 0)
+ {
+ step = 2;
+ instruction[1] ^= (dst_ind << 3);
+ }
+ }
+
switch (step)
{
case 0:
+ SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);
+
/* MOVDQA xmm1, xmm2/m128 */
- /* instruction[0] = 0x66; */
- /* instruction[1] = 0x0f; */
+ /* Prefix is filled. */
instruction[2] = 0x6f;
instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
sljit_emit_op_custom(compiler, instruction, 4);
@@ -173,26 +198,29 @@ switch (step)
case 1:
/* PCMPEQB/W/D xmm1, xmm2/m128 */
- /* instruction[0] = 0x66; */
- /* instruction[1] = 0x0f; */
- instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
+ if (reg_type == SLJIT_SIMD_REG_256)
+ instruction[1] ^= (dst_ind << 3);
+
+ /* Prefix is filled. */
+ instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;
case 2:
/* PCMPEQB/W/D xmm1, xmm2/m128 */
- /* instruction[0] = 0x66; */
- /* instruction[1] = 0x0f; */
- instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
+ /* Prefix is filled. */
+ instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
sljit_emit_op_custom(compiler, instruction, 4);
return;
case 3:
/* POR xmm1, xmm2/m128 */
- /* instruction[0] = 0x66; */
- /* instruction[1] = 0x0f; */
+ if (reg_type == SLJIT_SIMD_REG_256)
+ instruction[1] ^= (dst_ind << 3);
+
+ /* Prefix is filled. */
instruction[2] = 0xeb;
instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
sljit_emit_op_custom(compiler, instruction, 4);
@@ -200,12 +228,16 @@ switch (step)
}
}
-#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
+#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
+/* The AVX2 code path is currently disabled. */
+/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
+sljit_s32 reg_type = SLJIT_SIMD_REG_128;
+sljit_s32 value;
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
@@ -213,12 +245,11 @@ struct sljit_label *restart;
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
-sljit_s32 data_ind = 0;
-sljit_s32 tmp_ind = 1;
-sljit_s32 cmp1_ind = 2;
-sljit_s32 cmp2_ind = 3;
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
+sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
+sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
+sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
sljit_u32 bit = 0;
int i;
@@ -241,61 +272,34 @@ if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[0]);
/* First part (unaligned start) */
-
-OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
-
-SLJIT_ASSERT(tmp1_reg_ind < 8);
-
-/* MOVD xmm, r/m32 */
-instruction[0] = 0x66;
-instruction[1] = 0x0f;
-instruction[2] = 0x6e;
-instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
+sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
if (char1 != char2)
- {
- OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
-
- /* MOVD xmm, r/m32 */
- instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
- sljit_emit_op_custom(compiler, instruction, 4);
- }
+ sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
-/* PSHUFD xmm1, xmm2/m128, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x70;
-instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
-instruction[4] = 0;
-sljit_emit_op_custom(compiler, instruction, 5);
+sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
if (char1 != char2)
- {
- /* PSHUFD xmm1, xmm2/m128, imm8 */
- instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
- sljit_emit_op_custom(compiler, instruction, 5);
- }
+ sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
#endif
-OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
-for (i = 0; i < 4; i++)
- fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
+OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
-/* PMOVMSKB reg, xmm */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xd7;
-instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
+sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
+for (i = 0; i < 4; i++)
+ fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+
+sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
@@ -306,27 +310,24 @@ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
/* Second part (aligned) */
start = LABEL();
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[1]);
-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
+value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
+sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
for (i = 0; i < 4; i++)
- fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
-
-/* PMOVMSKB reg, xmm */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xd7;
-instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+ fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
JUMPHERE(quit);
+SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
@@ -339,8 +340,8 @@ if (common->mode != PCRE2_JIT_COMPLETE)
{
JUMPHERE(partial_quit[0]);
JUMPHERE(partial_quit[1]);
- OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
- CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
+ OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
+ SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
}
else
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
@@ -364,22 +365,25 @@ if (common->utf && offset > 0)
#endif
}
-#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
+#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
+/* The AVX2 code path is currently disabled. */
+/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
+sljit_s32 reg_type = SLJIT_SIMD_REG_128;
+sljit_s32 value;
struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
-sljit_s32 data_ind = 0;
-sljit_s32 tmp_ind = 1;
-sljit_s32 cmp1_ind = 2;
-sljit_s32 cmp2_ind = 3;
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 data_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
+sljit_s32 cmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
+sljit_s32 cmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
+sljit_s32 tmp_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
sljit_u32 bit = 0;
int i;
@@ -401,57 +405,30 @@ OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
/* First part (unaligned start) */
-OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
-
-SLJIT_ASSERT(tmp1_reg_ind < 8);
-
-/* MOVD xmm, r/m32 */
-instruction[0] = 0x66;
-instruction[1] = 0x0f;
-instruction[2] = 0x6e;
-instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
+sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
if (char1 != char2)
- {
- OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
-
- /* MOVD xmm, r/m32 */
- instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
- sljit_emit_op_custom(compiler, instruction, 4);
- }
+ sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
-/* PSHUFD xmm1, xmm2/m128, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x70;
-instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
-instruction[4] = 0;
-sljit_emit_op_custom(compiler, instruction, 5);
+sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR1, SLJIT_FR1, 0);
if (char1 != char2)
- {
- /* PSHUFD xmm1, xmm2/m128, imm8 */
- instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
- sljit_emit_op_custom(compiler, instruction, 5);
- }
+ sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
-OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
+value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
+OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
-for (i = 0; i < 4; i++)
- fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
+sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
-/* PMOVMSKB reg, xmm */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xd7;
-instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+for (i = 0; i < 4; i++)
+ fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
@@ -462,25 +439,23 @@ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
/* Second part (aligned) */
start = LABEL();
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
-for (i = 0; i < 4; i++)
- fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
+sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
-/* PMOVMSKB reg, xmm */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xd7;
-instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+for (i = 0; i < 4; i++)
+ fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
JUMPHERE(quit);
+SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
@@ -496,29 +471,31 @@ return not_found;
#ifndef _WIN64
-#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
+#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
sljit_u8 instruction[8];
+/* The AVX2 code path is currently disabled. */
+/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
+sljit_s32 reg_type = SLJIT_SIMD_REG_128;
+sljit_s32 value;
vector_compare_type compare1_type = vector_compare_match1;
vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
-sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
-sljit_s32 data1_ind = 0;
-sljit_s32 data2_ind = 1;
-sljit_s32 tmp1_ind = 2;
-sljit_s32 tmp2_ind = 3;
-sljit_s32 cmp1a_ind = 4;
-sljit_s32 cmp1b_ind = 5;
-sljit_s32 cmp2a_ind = 6;
-sljit_s32 cmp2b_ind = 7;
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 data1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR0);
+sljit_s32 data2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR1);
+sljit_s32 cmp1a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR2);
+sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
+sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR4);
+sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR5);
+sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR6);
+sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_TMP_FR0);
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
@@ -526,9 +503,8 @@ struct sljit_label *restart;
struct sljit_jump *jump[2];
int i;
-SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
-SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
-SLJIT_ASSERT(tmp1_reg_ind < 8 && tmp2_reg_ind == 1);
+SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
+SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));
/* Initialize. */
if (common->match_end_ptr != 0)
@@ -537,18 +513,13 @@ if (common->match_end_ptr != 0)
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
- OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
- CMOV(SLJIT_LESS, STR_END, TMP1, 0);
+ OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
+ SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
}
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
-/* MOVD xmm, r/m32 */
-instruction[0] = 0x66;
-instruction[1] = 0x0f;
-instruction[2] = 0x6e;
-
if (char1a == char1b)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
else
@@ -569,14 +540,11 @@ else
}
}
-instruction[3] = 0xc0 | (cmp1a_ind << 3) | tmp1_reg_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+value = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_LANE_ZERO;
+sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, TMP1, 0);
if (char1a != char1b)
- {
- instruction[3] = 0xc0 | (cmp1b_ind << 3) | tmp2_reg_ind;
- sljit_emit_op_custom(compiler, instruction, 4);
- }
+ sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR4, 0, TMP2, 0);
if (char2a == char2b)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
@@ -598,38 +566,18 @@ else
}
}
-instruction[3] = 0xc0 | (cmp2a_ind << 3) | tmp1_reg_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
+sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR3, 0, TMP1, 0);
if (char2a != char2b)
- {
- instruction[3] = 0xc0 | (cmp2b_ind << 3) | tmp2_reg_ind;
- sljit_emit_op_custom(compiler, instruction, 4);
- }
-
-/* PSHUFD xmm1, xmm2/m128, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x70;
-instruction[4] = 0;
-
-instruction[3] = 0xc0 | (cmp1a_ind << 3) | cmp1a_ind;
-sljit_emit_op_custom(compiler, instruction, 5);
+ sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR5, 0, TMP2, 0);
+sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR2, SLJIT_FR2, 0);
if (char1a != char1b)
- {
- instruction[3] = 0xc0 | (cmp1b_ind << 3) | cmp1b_ind;
- sljit_emit_op_custom(compiler, instruction, 5);
- }
-
-instruction[3] = 0xc0 | (cmp2a_ind << 3) | cmp2a_ind;
-sljit_emit_op_custom(compiler, instruction, 5);
+ sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR4, SLJIT_FR4, 0);
+sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR3, SLJIT_FR3, 0);
if (char2a != char2b)
- {
- instruction[3] = 0xc0 | (cmp2b_ind << 3) | cmp2b_ind;
- sljit_emit_op_custom(compiler, instruction, 5);
- }
+ sljit_emit_simd_lane_replicate(compiler, reg_type | SLJIT_SIMD_ELEM_32, SLJIT_FR5, SLJIT_FR5, 0);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
restart = LABEL();
@@ -637,55 +585,91 @@ restart = LABEL();
OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
-OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
+value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;
+OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
-load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
+value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
+sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
-load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
+sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
jump[1] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
-/* MOVDQA xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x6f;
-instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PSLLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (7 << 3) | data2_ind;
-instruction[4] = diff;
-sljit_emit_op_custom(compiler, instruction, 5);
+if (reg_type == SLJIT_SIMD_REG_256)
+ {
+ if (diff != 16)
+ {
+ /* PSLLDQ ymm1, ymm2, imm8 */
+ instruction[0] = 0xc5;
+ instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));
+ instruction[2] = 0x73;
+ instruction[3] = 0xc0 | (7 << 3) | data1_ind;
+ instruction[4] = diff & 0xf;
+ sljit_emit_op_custom(compiler, instruction, 5);
+ }
+
+ instruction[0] = 0xc4;
+ instruction[1] = 0xe3;
+ if (diff < 16)
+ {
+ /* VINSERTI128 xmm1, xmm2, xmm3/m128 */
+ /* instruction[0] = 0xc4; */
+ /* instruction[1] = 0xe3; */
+ instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));
+ instruction[3] = 0x38;
+ SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR) <= 7);
+ instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
+ instruction[5] = (sljit_u8)(16 - diff);
+ instruction[6] = 1;
+ sljit_emit_op_custom(compiler, instruction, 7);
+ }
+ else
+ {
+ /* VPERM2I128 xmm1, xmm2, xmm3/m128 */
+ /* instruction[0] = 0xc4; */
+ /* instruction[1] = 0xe3; */
+ value = (diff == 16) ? data1_ind : data2_ind;
+ instruction[2] = (sljit_u8)(0x7d ^ (value << 3));
+ instruction[3] = 0x46;
+ instruction[4] = 0xc0 | (data2_ind << 3) | value;
+ instruction[5] = 0x08;
+ sljit_emit_op_custom(compiler, instruction, 6);
+ }
+ }
+else
+ {
+ /* MOVDQA xmm1, xmm2/m128 */
+ instruction[0] = 0x66;
+ instruction[1] = 0x0f;
+ instruction[2] = 0x6f;
+ instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
+ sljit_emit_op_custom(compiler, instruction, 4);
+
+ /* PSLLDQ xmm1, imm8 */
+ /* instruction[0] = 0x66; */
+ /* instruction[1] = 0x0f; */
+ instruction[2] = 0x73;
+ instruction[3] = 0xc0 | (7 << 3) | data2_ind;
+ instruction[4] = diff;
+ sljit_emit_op_custom(compiler, instruction, 5);
+ }
JUMPHERE(jump[1]);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
+value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
for (i = 0; i < 4; i++)
{
- fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
- fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
+ fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
+ fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
}
-/* PAND xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xdb;
-instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PMOVMSKB reg, xmm */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xd7;
-instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
-sljit_emit_op_custom(compiler, instruction, 4);
+sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
+sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
/* Ignore matches before the first STR_PTR. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
@@ -698,36 +682,28 @@ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
/* Main loop. */
start = LABEL();
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
-load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
-load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
+value = (reg_type == SLJIT_SIMD_REG_256) ? SLJIT_SIMD_MEM_ALIGNED_256 : SLJIT_SIMD_MEM_ALIGNED_128;
+sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
+sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
for (i = 0; i < 4; i++)
{
- fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
- fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
+ fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
+ fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
}
-/* PAND xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xdb;
-instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PMOVMSKB reg, xmm */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xd7;
-instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
-sljit_emit_op_custom(compiler, instruction, 4);
+sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
+sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
JUMPHERE(jump[0]);
+SLJIT_ASSERT(tmp1_reg_ind < 8);
/* BSF r32, r/m32 */
instruction[0] = 0x0f;
instruction[1] = 0xbc;
@@ -762,7 +738,7 @@ if (common->match_end_ptr != 0)
#endif /* !_WIN64 */
-#undef SSE2_COMPARE_TYPE_INDEX
+#undef SIMD_COMPARE_TYPE_INDEX
#endif /* SLJIT_CONFIG_X86 */
@@ -776,7 +752,7 @@ typedef union {
} int_char;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
-static SLJIT_INLINE int utf_continue(sljit_u8 *s)
+static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
return (*s & 0xc0) == 0x80;
@@ -865,14 +841,14 @@ static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, P
{
DEFINE_COMPILER;
int_char ic;
-struct sljit_jump *partial_quit;
+struct sljit_jump *partial_quit, *quit;
/* Save temporary registers. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);
/* Prepare function arguments */
OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
-OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
+GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);
if (char1 == char2)
@@ -883,14 +859,14 @@ if (char1 == char2)
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_utf));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
else
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
#else
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
#endif
}
else
@@ -904,14 +880,14 @@ else
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask_utf));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
else
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
#else
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
#endif
}
else
@@ -922,14 +898,14 @@ else
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf && offset > 0)
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2_utf));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
else
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
#else
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
#endif
}
}
@@ -944,9 +920,14 @@ if (common->mode == PCRE2_JIT_COMPLETE)
/* Fast forward STR_PTR to the result of memchr. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
-
if (common->mode != PCRE2_JIT_COMPLETE)
+ {
+ quit = CMP(SLJIT_NOT_ZERO, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
JUMPHERE(partial_quit);
+ OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
+ SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
+ JUMPHERE(quit);
+ }
}
typedef enum {
@@ -1067,11 +1048,11 @@ else
OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
- OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, STR_END, 0, SLJIT_R0, 0);
- CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0);
+ OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
+ SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);
}
-OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
+GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
ic.c.c1 = char1a;
@@ -1084,31 +1065,31 @@ if (diff == 1) {
if (char1a == char1b && char2a == char2b) {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0_utf));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
else
#endif
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
} else {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1_utf));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
else
#endif
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
}
} else {
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default_utf));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
else
#endif
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
- SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
+ SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
}
/* Restore STR_PTR register. */
@@ -1177,7 +1158,7 @@ if (step == 0)
OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);
/* VLVG */
- instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(tmp_general_reg));
+ instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
instruction[1] = 0;
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
sljit_emit_op_custom(compiler, instruction, 6);
@@ -1256,8 +1237,8 @@ struct sljit_label *restart;
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
@@ -1418,8 +1399,8 @@ if (common->mode != PCRE2_JIT_COMPLETE)
{
JUMPHERE(partial_quit[0]);
JUMPHERE(partial_quit[1]);
- OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
- CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
+ OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
+ SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
}
else
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
@@ -1454,8 +1435,8 @@ struct sljit_label *start;
struct sljit_jump *quit;
jump_list *not_found = NULL;
vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
-sljit_s32 tmp3_reg_ind = sljit_get_register_index(TMP3);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
sljit_s32 data_ind = 0;
sljit_s32 tmp_ind = 1;
sljit_s32 cmp1_ind = 2;
@@ -1624,9 +1605,9 @@ vector_compare_type compare2_type = vector_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_s32 diff = IN_UCHARS(offs2 - offs1);
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
-sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
sljit_s32 data1_ind = 0;
sljit_s32 data2_ind = 1;
sljit_s32 tmp1_ind = 2;
@@ -1673,8 +1654,8 @@ if (common->match_end_ptr != 0)
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
- OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
- CMOV(SLJIT_LESS, STR_END, TMP1, 0);
+ OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
+ SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
}
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
@@ -1855,4 +1836,520 @@ if (common->match_end_ptr != 0)
#endif /* SLJIT_CONFIG_S390X */
+#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)
+
+#ifdef __linux__
+/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */
+#include <sys/auxv.h>
+#define LOONGARCH_HWCAP_LSX (1 << 4)
+#define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)
+#else
+#define HAS_LSX_SUPPORT 0
+#endif
+
+typedef sljit_ins sljit_u32;
+
+#define SI12_IMM_MASK 0x003ffc00
+#define UI5_IMM_MASK 0x00007c00
+#define UI2_IMM_MASK 0x00000c00
+
+#define VD(vd) ((sljit_ins)vd << 0)
+#define VJ(vj) ((sljit_ins)vj << 5)
+#define VK(vk) ((sljit_ins)vk << 10)
+#define RD_V(rd) ((sljit_ins)rd << 0)
+#define RJ_V(rj) ((sljit_ins)rj << 5)
+
+#define IMM_SI12(imm) (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)
+#define IMM_UI5(imm) (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)
+#define IMM_UI2(imm) (((sljit_ins)(imm) << 10) & UI2_IMM_MASK)
+
+// LSX OPCODES:
+#define VLD 0x2c000000
+#define VOR_V 0x71268000
+#define VAND_V 0x71260000
+#define VBSLL_V 0x728e0000
+#define VMSKLTZ_B 0x729c4000
+#define VPICKVE2GR_WU 0x72f3e000
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+#define VREPLGR2VR 0x729f0000
+#define VSEQ 0x70000000
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+#define VREPLGR2VR 0x729f0400
+#define VSEQ 0x70008000
+#else
+#define VREPLGR2VR 0x729f0800
+#define VSEQ 0x70010000
+#endif
+
+static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
+ sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
+{
+if (compare_type != vector_compare_match2)
+ {
+ if (compare_type == vector_compare_match1i)
+ {
+ /* VOR.V vd, vj, vk */
+ push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
+ }
+
+ /* VSEQ.B/H/W vd, vj, vk */
+ push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
+ return;
+ }
+
+/* VBSLL.V vd, vj, ui5 */
+push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));
+
+/* VSEQ.B/H/W vd, vj, vk */
+push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
+
+/* VSEQ.B/H/W vd, vj, vk */
+push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));
+
+/* VOR vd, vj, vk */
+push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
+return;
+}
+
+#define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT
+
+static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
+{
+DEFINE_COMPILER;
+struct sljit_label *start;
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+struct sljit_label *restart;
+#endif
+struct sljit_jump *quit;
+struct sljit_jump *partial_quit[2];
+vector_compare_type compare_type = vector_compare_match1;
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
+sljit_s32 data_ind = 0;
+sljit_s32 tmp_ind = 1;
+sljit_s32 cmp1_ind = 2;
+sljit_s32 cmp2_ind = 3;
+sljit_u32 bit = 0;
+
+SLJIT_UNUSED_ARG(offset);
+
+if (char1 != char2)
+ {
+ bit = char1 ^ char2;
+ compare_type = vector_compare_match1i;
+
+ if (!is_powerof2(bit))
+ {
+ bit = 0;
+ compare_type = vector_compare_match2;
+ }
+ }
+
+partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+if (common->mode == PCRE2_JIT_COMPLETE)
+ add_jump(compiler, &common->failed_match, partial_quit[0]);
+
+/* First part (unaligned start) */
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
+
+/* VREPLGR2VR.B/H/W vd, rj */
+push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
+
+if (char1 != char2)
+ {
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
+
+ /* VREPLGR2VR.B/H/W vd, rj */
+ push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
+ }
+
+OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+restart = LABEL();
+#endif
+
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
+fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+
+/* VMSKLTZ.B vd, vj */
+push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
+
+/* VPICKVE2GR.WU rd, vj, ui2 */
+push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+
+/* Second part (aligned) */
+start = LABEL();
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+
+partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+if (common->mode == PCRE2_JIT_COMPLETE)
+ add_jump(compiler, &common->failed_match, partial_quit[1]);
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
+fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+
+/* VMSKLTZ.B vd, vj */
+push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
+
+/* VPICKVE2GR.WU rd, vj, ui2 */
+push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
+
+CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
+
+JUMPHERE(quit);
+
+/* CTZ.W rd, rj */
+push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+
+if (common->mode != PCRE2_JIT_COMPLETE)
+ {
+ JUMPHERE(partial_quit[0]);
+ JUMPHERE(partial_quit[1]);
+ OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
+ SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
+ }
+else
+ add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+if (common->utf && offset > 0)
+ {
+ SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
+
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
+
+ quit = jump_if_utf_char_start(compiler, TMP1);
+
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+ OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
+ JUMPTO(SLJIT_JUMP, restart);
+
+ JUMPHERE(quit);
+ }
+#endif
+}
+
+#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT
+
+static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
+{
+DEFINE_COMPILER;
+struct sljit_label *start;
+struct sljit_jump *quit;
+jump_list *not_found = NULL;
+vector_compare_type compare_type = vector_compare_match1;
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
+sljit_s32 data_ind = 0;
+sljit_s32 tmp_ind = 1;
+sljit_s32 cmp1_ind = 2;
+sljit_s32 cmp2_ind = 3;
+sljit_u32 bit = 0;
+
+if (char1 != char2)
+ {
+ bit = char1 ^ char2;
+ compare_type = vector_compare_match1i;
+
+ if (!is_powerof2(bit))
+ {
+ bit = 0;
+ compare_type = vector_compare_match2;
+ }
+ }
+
+add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
+OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
+OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
+
+/* First part (unaligned start) */
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
+
+/* VREPLGR2VR vd, rj */
+push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
+
+if (char1 != char2)
+ {
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
+ /* VREPLGR2VR vd, rj */
+ push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
+ }
+
+OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
+fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+
+/* VMSKLTZ.B vd, vj */
+push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
+
+/* VPICKVE2GR.WU rd, vj, ui2 */
+push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+
+/* Second part (aligned) */
+start = LABEL();
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+
+add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
+fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+
+/* VMSKLTZ.B vd, vj */
+push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
+
+/* VPICKVE2GR.WU rd, vj, ui2 */
+push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
+
+CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
+
+JUMPHERE(quit);
+
+/* CTZ.W rd, rj */
+push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
+
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
+add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
+
+OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
+return not_found;
+}
+
+#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD HAS_LSX_SUPPORT
+
+static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
+ PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
+{
+DEFINE_COMPILER;
+vector_compare_type compare1_type = vector_compare_match1;
+vector_compare_type compare2_type = vector_compare_match1;
+sljit_u32 bit1 = 0;
+sljit_u32 bit2 = 0;
+sljit_u32 diff = IN_UCHARS(offs1 - offs2);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
+sljit_s32 data1_ind = 0;
+sljit_s32 data2_ind = 1;
+sljit_s32 tmp1_ind = 2;
+sljit_s32 tmp2_ind = 3;
+sljit_s32 cmp1a_ind = 4;
+sljit_s32 cmp1b_ind = 5;
+sljit_s32 cmp2a_ind = 6;
+sljit_s32 cmp2b_ind = 7;
+struct sljit_label *start;
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+struct sljit_label *restart;
+#endif
+struct sljit_jump *jump[2];
+
+SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
+SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
+
+/* Initialize. */
+if (common->match_end_ptr != 0)
+ {
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
+ OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
+
+ OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
+ SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
+ }
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
+add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
+if (char1a == char1b)
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
+else
+ {
+ bit1 = char1a ^ char1b;
+ if (is_powerof2(bit1))
+ {
+ compare1_type = vector_compare_match1i;
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
+ }
+ else
+ {
+ compare1_type = vector_compare_match2;
+ bit1 = 0;
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
+ }
+ }
+
+/* VREPLGR2VR vd, rj */
+push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));
+
+if (char1a != char1b)
+ {
+ /* VREPLGR2VR vd, rj */
+ push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
+ }
+
+if (char2a == char2b)
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
+else
+ {
+ bit2 = char2a ^ char2b;
+ if (is_powerof2(bit2))
+ {
+ compare2_type = vector_compare_match1i;
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
+ }
+ else
+ {
+ compare2_type = vector_compare_match2;
+ bit2 = 0;
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
+ }
+ }
+
+/* VREPLGR2VR vd, rj */
+push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));
+
+if (char2a != char2b)
+ {
+ /* VREPLGR2VR vd, rj */
+ push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
+ }
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+restart = LABEL();
+#endif
+
+OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
+OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
+
+jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
+jump[1] = JUMP(SLJIT_JUMP);
+
+JUMPHERE(jump[0]);
+
+/* VBSLL.V vd, vj, ui5 */
+push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));
+
+JUMPHERE(jump[1]);
+
+fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
+fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
+
+/* VAND vd, vj, vk */
+push_inst(compiler, VOR_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));
+
+/* VMSKLTZ.B vd, vj */
+push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));
+
+/* VPICKVE2GR.WU rd, vj, ui2 */
+push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));
+
+/* Ignore matches before the first STR_PTR. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+
+/* Main loop. */
+start = LABEL();
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
+/* VLD vd, rj, si12 */
+push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
+push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
+
+fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
+fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
+
+/* VAND.V vd, vj, vk */
+push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));
+
+/* VMSKLTZ.B vd, vj */
+push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));
+
+/* VPICKVE2GR.WU rd, vj, ui2 */
+push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));
+
+CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
+
+JUMPHERE(jump[0]);
+
+/* CTZ.W rd, rj */
+push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+
+add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+if (common->utf)
+ {
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
+
+ jump[0] = jump_if_utf_char_start(compiler, TMP1);
+
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
+
+ add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
+
+ JUMPHERE(jump[0]);
+ }
+#endif
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
+
+if (common->match_end_ptr != 0)
+ OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
+}
+
+#endif /* SLJIT_CONFIG_LOONGARCH_64 */
+
#endif /* !SUPPORT_VALGRIND */