Diffstat (limited to 'src/3rdparty/pcre2/src/sljit/sljitNativeX86_common.c')
-rw-r--r--  src/3rdparty/pcre2/src/sljit/sljitNativeX86_common.c | 3555
1 file changed, 2613 insertions, 942 deletions
diff --git a/src/3rdparty/pcre2/src/sljit/sljitNativeX86_common.c b/src/3rdparty/pcre2/src/sljit/sljitNativeX86_common.c
index ddcc5ebf76..c2c0421349 100644
--- a/src/3rdparty/pcre2/src/sljit/sljitNativeX86_common.c
+++ b/src/3rdparty/pcre2/src/sljit/sljitNativeX86_common.c
@@ -24,13 +24,15 @@
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  */

+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
+#endif /* __has_feature(memory_sanitizer) */
+#endif /* defined(__has_feature) */
+
 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
 {
-#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
-	return "x86" SLJIT_CPUINFO " ABI:fastcall";
-#else
 	return "x86" SLJIT_CPUINFO;
-#endif
 }

 /*
@@ -65,33 +67,33 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
    15 - R15
 */

+#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
+#define TMP_FREG	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

-/* Last register + 1. */
-#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
-
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
-	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
+	0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3
+};
+
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+	0, 1, 2, 3, 4, 5, 6, 7, 0
 };

 #define CHECK_EXTRA_REGS(p, w, do) \
 	if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
-		if (p <= compiler->scratches) \
-			w = compiler->saveds_offset - ((p) - SLJIT_R2) * (sljit_sw)sizeof(sljit_sw); \
-		else \
-			w = compiler->locals_offset + ((p) - SLJIT_S2) * (sljit_sw)sizeof(sljit_sw); \
+		w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
 		p = SLJIT_MEM1(SLJIT_SP); \
 		do; \
 	}

 #else /* SLJIT_CONFIG_X86_32 */

-/* Last register + 1. */
-#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)

 /* Note: r12 & 0x7 == 0b100, which decoded as SIB byte present
-   Note: avoid to use r12 and r13 for memory addessing
+   Note: avoid to use r12 and r13 for memory addressing
    therefore r12 is better to be a higher saved register. */
 #ifndef _WIN64
 /* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
@@ -100,7 +102,7 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
 };
 /* low-map. reg_map & 0x7. */
 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
-	0, 0, 6, 7, 1, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1
+	0, 0, 6, 7, 1, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1
 };
 #else
 /* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
@@ -114,12 +116,12 @@ static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
 #endif

 /* Args: xmm0-xmm3 */
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
-	4, 0, 1, 2, 3, 5, 6
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+	0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4
 };
 /* low-map. freg_map & 0x7. */
-static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
-	4, 0, 1, 2, 3, 5, 6
+static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+	0, 0, 1, 2, 3, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4
 };

 #define REX_W		0x48
@@ -143,153 +145,255 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {

 #endif /* SLJIT_CONFIG_X86_32 */

-#define TMP_FREG	(0)
+#define U8(v)			((sljit_u8)(v))

 /* Size flags for emit_x86_instruction: */
-#define EX86_BIN_INS		0x0010
-#define EX86_SHIFT_INS		0x0020
-#define EX86_REX		0x0040
-#define EX86_NO_REXW		0x0080
-#define EX86_BYTE_ARG		0x0100
-#define EX86_HALF_ARG		0x0200
-#define EX86_PREF_66		0x0400
-#define EX86_PREF_F2		0x0800
-#define EX86_PREF_F3		0x1000
-#define EX86_SSE2_OP1		0x2000
-#define EX86_SSE2_OP2		0x4000
+#define EX86_BIN_INS		((sljit_uw)0x000010)
+#define EX86_SHIFT_INS		((sljit_uw)0x000020)
+#define EX86_BYTE_ARG		((sljit_uw)0x000040)
+#define EX86_HALF_ARG		((sljit_uw)0x000080)
+/* Size flags for both emit_x86_instruction and emit_vex_instruction: */
+#define EX86_REX		((sljit_uw)0x000100)
+#define EX86_NO_REXW		((sljit_uw)0x000200)
+#define EX86_PREF_66		((sljit_uw)0x000400)
+#define EX86_PREF_F2		((sljit_uw)0x000800)
+#define EX86_PREF_F3		((sljit_uw)0x001000)
+#define EX86_SSE2_OP1		((sljit_uw)0x002000)
+#define EX86_SSE2_OP2		((sljit_uw)0x004000)
 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
+#define EX86_VEX_EXT		((sljit_uw)0x008000)
+/* Op flags for emit_vex_instruction: */
+#define VEX_OP_0F38		((sljit_uw)0x010000)
+#define VEX_OP_0F3A		((sljit_uw)0x020000)
+#define VEX_SSE2_OPV		((sljit_uw)0x040000)
+#define VEX_AUTO_W		((sljit_uw)0x080000)
+#define VEX_W			((sljit_uw)0x100000)
+#define VEX_256			((sljit_uw)0x200000)
+
+#define EX86_SELECT_66(op)	(((op) & SLJIT_32) ? 0 : EX86_PREF_66)
+#define EX86_SELECT_F2_F3(op)	(((op) & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2)

 /* --------------------------------------------------------------------- */
-/*  Instrucion forms                                                     */
+/*  Instruction forms                                                    */
 /* --------------------------------------------------------------------- */

-#define ADD		(/* BINARY */ 0 << 3)
-#define ADD_EAX_i32	0x05
-#define ADD_r_rm	0x03
-#define ADD_rm_r	0x01
-#define ADDSD_x_xm	0x58
-#define ADC		(/* BINARY */ 2 << 3)
-#define ADC_EAX_i32	0x15
-#define ADC_r_rm	0x13
-#define ADC_rm_r	0x11
-#define AND		(/* BINARY */ 4 << 3)
-#define AND_EAX_i32	0x25
-#define AND_r_rm	0x23
-#define AND_rm_r	0x21
-#define ANDPD_x_xm	0x54
-#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
-#define CALL_i32	0xe8
-#define CALL_rm	(/* GROUP_FF */ 2 << 3)
-#define CDQ		0x99
-#define CMOVE_r_rm	(/* GROUP_0F */ 0x44)
-#define CMP		(/* BINARY */ 7 << 3)
-#define CMP_EAX_i32	0x3d
-#define CMP_r_rm	0x3b
-#define CMP_rm_r	0x39
-#define CVTPD2PS_x_xm	0x5a
-#define CVTSI2SD_x_rm	0x2a
-#define CVTTSD2SI_r_xm	0x2c
-#define DIV		(/* GROUP_F7 */ 6 << 3)
-#define DIVSD_x_xm	0x5e
-#define FSTPS		0xd9
-#define FSTPD		0xdd
-#define INT3		0xcc
-#define IDIV		(/* GROUP_F7 */ 7 << 3)
-#define IMUL		(/* GROUP_F7 */ 5 << 3)
-#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
-#define IMUL_r_rm_i8	0x6b
-#define IMUL_r_rm_i32	0x69
-#define JE_i8		0x74
-#define JNE_i8		0x75
-#define JMP_i8		0xeb
-#define JMP_i32	0xe9
-#define JMP_rm		(/* GROUP_FF */ 4 << 3)
-#define LEA_r_m	0x8d
-#define MOV_r_rm	0x8b
-#define MOV_r_i32	0xb8
-#define MOV_rm_r	0x89
-#define MOV_rm_i32	0xc7
-#define MOV_rm8_i8	0xc6
-#define MOV_rm8_r8	0x88
-#define MOVSD_x_xm	0x10
-#define MOVSD_xm_x	0x11
-#define MOVSXD_r_rm	0x63
-#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
-#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
-#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
-#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
-#define MUL		(/* GROUP_F7 */ 4 << 3)
-#define MULSD_x_xm	0x59
-#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
-#define NOP		0x90
-#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
-#define OR		(/* BINARY */ 1 << 3)
-#define OR_r_rm	0x0b
-#define OR_EAX_i32	0x0d
-#define OR_rm_r	0x09
-#define OR_rm8_r8	0x08
-#define POP_r		0x58
-#define POP_rm		0x8f
-#define POPF		0x9d
-#define PREFETCH	0x18
-#define PUSH_i32	0x68
-#define PUSH_r		0x50
-#define PUSH_rm	(/* GROUP_FF */ 6 << 3)
-#define PUSHF		0x9c
-#define RET_near	0xc3
-#define RET_i16	0xc2
-#define SBB		(/* BINARY */ 3 << 3)
-#define SBB_EAX_i32	0x1d
-#define SBB_r_rm	0x1b
-#define SBB_rm_r	0x19
-#define SAR		(/* SHIFT */ 7 << 3)
-#define SHL		(/* SHIFT */ 4 << 3)
-#define SHR		(/* SHIFT */ 5 << 3)
-#define SUB		(/* BINARY */ 5 << 3)
-#define SUB_EAX_i32	0x2d
-#define SUB_r_rm	0x2b
-#define SUB_rm_r	0x29
-#define SUBSD_x_xm	0x5c
-#define TEST_EAX_i32	0xa9
-#define TEST_rm_r	0x85
-#define UCOMISD_x_xm	0x2e
-#define UNPCKLPD_x_xm	0x14
-#define XCHG_EAX_r	0x90
-#define XCHG_r_rm	0x87
-#define XOR		(/* BINARY */ 6 << 3)
-#define XOR_EAX_i32	0x35
-#define XOR_r_rm	0x33
-#define XOR_rm_r	0x31
-#define XORPD_x_xm	0x57
-
-#define GROUP_0F	0x0f
-#define GROUP_F7	0xf7
-#define GROUP_FF	0xff
-#define GROUP_BINARY_81	0x81
-#define GROUP_BINARY_83	0x83
-#define GROUP_SHIFT_1	0xd1
-#define GROUP_SHIFT_N	0xc1
-#define GROUP_SHIFT_CL	0xd3
-
-#define MOD_REG	0xc0
-#define MOD_DISP8	0x40
-
-#define INC_SIZE(s)	(*inst++ = (s), compiler->size += (s))
-
-#define PUSH_REG(r)	(*inst++ = (PUSH_r + (r)))
-#define POP_REG(r)	(*inst++ = (POP_r + (r)))
-#define RET()		(*inst++ = (RET_near))
-#define RET_I16(n)	(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
-/* r32, r/m32 */
-#define MOV_RM(mod, reg, rm)	(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
+#define ADD		(/* BINARY */ 0 << 3)
+#define ADD_EAX_i32	0x05
+#define ADD_r_rm	0x03
+#define ADD_rm_r	0x01
+#define ADDSD_x_xm	0x58
+#define ADC		(/* BINARY */ 2 << 3)
+#define ADC_EAX_i32	0x15
+#define ADC_r_rm	0x13
+#define ADC_rm_r	0x11
+#define AND		(/* BINARY */ 4 << 3)
+#define AND_EAX_i32	0x25
+#define AND_r_rm	0x23
+#define AND_rm_r	0x21
+#define ANDPD_x_xm	0x54
+#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
+#define BSF_r_rm	(/* GROUP_0F */ 0xbc)
+#define BSWAP_r	(/* GROUP_0F */ 0xc8)
+#define CALL_i32	0xe8
+#define CALL_rm	(/* GROUP_FF */ 2 << 3)
+#define CDQ		0x99
+#define CMOVE_r_rm	(/* GROUP_0F */ 0x44)
+#define CMP		(/* BINARY */ 7 << 3)
+#define CMP_EAX_i32	0x3d
+#define CMP_r_rm	0x3b
+#define CMP_rm_r	0x39
+#define CMPS_x_xm	0xc2
+#define CMPXCHG_rm_r	0xb1
+#define CMPXCHG_rm8_r	0xb0
+#define CVTPD2PS_x_xm	0x5a
+#define CVTPS2PD_x_xm	0x5a
+#define CVTSI2SD_x_rm	0x2a
+#define CVTTSD2SI_r_xm	0x2c
+#define DIV		(/* GROUP_F7 */ 6 << 3)
+#define DIVSD_x_xm	0x5e
+#define EXTRACTPS_x_xm	0x17
+#define FLDS		0xd9
+#define FLDL		0xdd
+#define FSTPS		0xd9
+#define FSTPD		0xdd
+#define INSERTPS_x_xm	0x21
+#define INT3		0xcc
+#define IDIV		(/* GROUP_F7 */ 7 << 3)
+#define IMUL		(/* GROUP_F7 */ 5 << 3)
+#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
+#define IMUL_r_rm_i8	0x6b
+#define IMUL_r_rm_i32	0x69
+#define JL_i8		0x7c
+#define JE_i8		0x74
+#define JNC_i8		0x73
+#define JNE_i8		0x75
+#define JMP_i8		0xeb
+#define JMP_i32	0xe9
+#define JMP_rm		(/* GROUP_FF */ 4 << 3)
+#define LEA_r_m	0x8d
+#define LOOP_i8	0xe2
+#define LZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
+#define MOV_r_rm	0x8b
+#define MOV_r_i32	0xb8
+#define MOV_rm_r	0x89
+#define MOV_rm_i32	0xc7
+#define MOV_rm8_i8	0xc6
+#define MOV_rm8_r8	0x88
+#define MOVAPS_x_xm	0x28
+#define MOVAPS_xm_x	0x29
+#define MOVD_x_rm	0x6e
+#define MOVD_rm_x	0x7e
+#define MOVDDUP_x_xm	0x12
+#define MOVDQA_x_xm	0x6f
+#define MOVDQA_xm_x	0x7f
+#define MOVHLPS_x_x	0x12
+#define MOVHPD_m_x	0x17
+#define MOVHPD_x_m	0x16
+#define MOVLHPS_x_x	0x16
+#define MOVLPD_m_x	0x13
+#define MOVLPD_x_m	0x12
+#define MOVMSKPS_r_x	(/* GROUP_0F */ 0x50)
+#define MOVQ_x_xm	(/* GROUP_0F */ 0x7e)
+#define MOVSD_x_xm	0x10
+#define MOVSD_xm_x	0x11
+#define MOVSHDUP_x_xm	0x16
+#define MOVSXD_r_rm	0x63
+#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
+#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
+#define MOVUPS_x_xm	0x10
+#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
+#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
+#define MUL		(/* GROUP_F7 */ 4 << 3)
+#define MULSD_x_xm	0x59
+#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
+#define NOP		0x90
+#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
+#define OR		(/* BINARY */ 1 << 3)
+#define OR_r_rm	0x0b
+#define OR_EAX_i32	0x0d
+#define OR_rm_r	0x09
+#define OR_rm8_r8	0x08
+#define ORPD_x_xm	0x56
+#define PACKSSWB_x_xm	(/* GROUP_0F */ 0x63)
+#define PAND_x_xm	0xdb
+#define PCMPEQD_x_xm	0x76
+#define PINSRB_x_rm_i8	0x20
+#define PINSRW_x_rm_i8	0xc4
+#define PINSRD_x_rm_i8	0x22
+#define PEXTRB_rm_x_i8	0x14
+#define PEXTRW_rm_x_i8	0x15
+#define PEXTRD_rm_x_i8	0x16
+#define PMOVMSKB_r_x	(/* GROUP_0F */ 0xd7)
+#define PMOVSXBD_x_xm	0x21
+#define PMOVSXBQ_x_xm	0x22
+#define PMOVSXBW_x_xm	0x20
+#define PMOVSXDQ_x_xm	0x25
+#define PMOVSXWD_x_xm	0x23
+#define PMOVSXWQ_x_xm	0x24
+#define PMOVZXBD_x_xm	0x31
+#define PMOVZXBQ_x_xm	0x32
+#define PMOVZXBW_x_xm	0x30
+#define PMOVZXDQ_x_xm	0x35
+#define PMOVZXWD_x_xm	0x33
+#define PMOVZXWQ_x_xm	0x34
+#define POP_r		0x58
+#define POP_rm		0x8f
+#define POPF		0x9d
+#define POR_x_xm	0xeb
+#define PREFETCH	0x18
+#define PSHUFB_x_xm	0x00
+#define PSHUFD_x_xm	0x70
+#define PSHUFLW_x_xm	0x70
+#define PSRLDQ_x	0x73
+#define PSLLD_x_i8	0x72
+#define PSLLQ_x_i8	0x73
+#define PUSH_i32	0x68
+#define PUSH_r		0x50
+#define PUSH_rm	(/* GROUP_FF */ 6 << 3)
+#define PUSHF		0x9c
+#define PXOR_x_xm	0xef
+#define ROL		(/* SHIFT */ 0 << 3)
+#define ROR		(/* SHIFT */ 1 << 3)
+#define RET_near	0xc3
+#define RET_i16	0xc2
+#define SBB		(/* BINARY */ 3 << 3)
+#define SBB_EAX_i32	0x1d
+#define SBB_r_rm	0x1b
+#define SBB_rm_r	0x19
+#define SAR		(/* SHIFT */ 7 << 3)
+#define SHL		(/* SHIFT */ 4 << 3)
+#define SHLD		(/* GROUP_0F */ 0xa5)
+#define SHRD		(/* GROUP_0F */ 0xad)
+#define SHR		(/* SHIFT */ 5 << 3)
+#define SHUFPS_x_xm	0xc6
+#define SUB		(/* BINARY */ 5 << 3)
+#define SUB_EAX_i32	0x2d
+#define SUB_r_rm	0x2b
+#define SUB_rm_r	0x29
+#define SUBSD_x_xm	0x5c
+#define TEST_EAX_i32	0xa9
+#define TEST_rm_r	0x85
+#define TZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
+#define UCOMISD_x_xm	0x2e
+#define UNPCKLPD_x_xm	0x14
+#define UNPCKLPS_x_xm	0x14
+#define VBROADCASTSD_x_xm	0x19
+#define VBROADCASTSS_x_xm	0x18
+#define VEXTRACTF128_x_ym	0x19
+#define VEXTRACTI128_x_ym	0x39
+#define VINSERTF128_y_y_xm	0x18
+#define VINSERTI128_y_y_xm	0x38
+#define VPBROADCASTB_x_xm	0x78
+#define VPBROADCASTD_x_xm	0x58
+#define VPBROADCASTQ_x_xm	0x59
+#define VPBROADCASTW_x_xm	0x79
+#define VPERMPD_y_ym		0x01
+#define VPERMQ_y_ym		0x00
+#define XCHG_EAX_r	0x90
+#define XCHG_r_rm	0x87
+#define XOR		(/* BINARY */ 6 << 3)
+#define XOR_EAX_i32	0x35
+#define XOR_r_rm	0x33
+#define XOR_rm_r	0x31
+#define XORPD_x_xm	0x57
+
+#define GROUP_0F	0x0f
+#define GROUP_66	0x66
+#define GROUP_F3	0xf3
+#define GROUP_F7	0xf7
+#define GROUP_FF	0xff
+#define GROUP_BINARY_81	0x81
+#define GROUP_BINARY_83	0x83
+#define GROUP_SHIFT_1	0xd1
+#define GROUP_SHIFT_N	0xc1
+#define GROUP_SHIFT_CL	0xd3
+#define GROUP_LOCK	0xf0
+
+#define MOD_REG	0xc0
+#define MOD_DISP8	0x40
+
+#define INC_SIZE(s)	(*inst++ = U8(s), compiler->size += (s))
+
+#define PUSH_REG(r)	(*inst++ = U8(PUSH_r + (r)))
+#define POP_REG(r)	(*inst++ = U8(POP_r + (r)))
+#define RET()		(*inst++ = RET_near)
+#define RET_I16(n)	(*inst++ = RET_i16, *inst++ = U8(n), *inst++ = 0)

 /* Multithreading does not affect these static variables, since they store
    built-in CPU features. Therefore they can be overwritten by different threads
    if they detect the CPU features in the same time. */
+#define CPU_FEATURE_DETECTED	0x001
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-static sljit_s32 cpu_has_sse2 = -1;
+#define CPU_FEATURE_SSE2	0x002
 #endif
-static sljit_s32 cpu_has_cmov = -1;
+#define CPU_FEATURE_SSE41	0x004
+#define CPU_FEATURE_LZCNT	0x008
+#define CPU_FEATURE_TZCNT	0x010
+#define CPU_FEATURE_CMOV	0x020
+#define CPU_FEATURE_AVX	0x040
+#define CPU_FEATURE_AVX2	0x080
+
+static sljit_u32 cpu_feature_list = 0;

 #ifdef _WIN32_WCE
 #include <cmnintrin.h>
@@ -320,82 +424,167 @@ static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
 /* Utility functions */
 /******************************************************/

-static void get_cpu_features(void)
+static void execute_cpu_id(sljit_u32 info[4])
 {
-	sljit_u32 features;
-
 #if defined(_MSC_VER) && _MSC_VER >= 1400
-	int CPUInfo[4];
-	__cpuid(CPUInfo, 1);
-	features = (sljit_u32)CPUInfo[3];
+	__cpuidex((int*)info, (int)info[0], (int)info[2]);

-#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
+#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)

 	/* AT&T syntax. */
 	__asm__ (
-		"movl $0x1, %%eax\n"
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		/* On x86-32, there is no red zone, so this
-		should work (no need for a local variable). */
-		"push %%ebx\n"
-#endif
+		"movl %0, %%esi\n"
+		"movl (%%esi), %%eax\n"
+		"movl 8(%%esi), %%ecx\n"
+		"pushl %%ebx\n"
 		"cpuid\n"
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		"pop %%ebx\n"
-#endif
-		"movl %%edx, %0\n"
-		: "=g" (features)
+		"movl %%eax, (%%esi)\n"
+		"movl %%ebx, 4(%%esi)\n"
+		"popl %%ebx\n"
+		"movl %%ecx, 8(%%esi)\n"
+		"movl %%edx, 12(%%esi)\n"
+#else /* !SLJIT_CONFIG_X86_32 */
+		"movq %0, %%rsi\n"
+		"movl (%%rsi), %%eax\n"
+		"movl 8(%%rsi), %%ecx\n"
+		"cpuid\n"
+		"movl %%eax, (%%rsi)\n"
+		"movl %%ebx, 4(%%rsi)\n"
+		"movl %%ecx, 8(%%rsi)\n"
+		"movl %%edx, 12(%%rsi)\n"
+#endif /* SLJIT_CONFIG_X86_32 */
 		:
+		: "r" (info)
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		: "%eax", "%ecx", "%edx"
-#else
-		: "%rax", "%rbx", "%rcx", "%rdx"
-#endif
+		: "memory", "eax", "ecx", "edx", "esi"
+#else /* !SLJIT_CONFIG_X86_32 */
+		: "memory", "rax", "rbx", "rcx", "rdx", "rsi"
+#endif /* SLJIT_CONFIG_X86_32 */
 	);

-#else /* _MSC_VER && _MSC_VER >= 1400 */
+#else /* _MSC_VER < 1400 */

 	/* Intel syntax. */
 	__asm {
-		mov eax, 1
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		mov esi, info
+		mov eax, [esi]
+		mov ecx, [esi + 8]
+		cpuid
+		mov [esi], eax
+		mov [esi + 4], ebx
+		mov [esi + 8], ecx
+		mov [esi + 12], edx
+#else /* !SLJIT_CONFIG_X86_32 */
+		mov rsi, info
+		mov eax, [rsi]
+		mov ecx, [rsi + 8]
 		cpuid
-		mov features, edx
+		mov [rsi], eax
+		mov [rsi + 4], ebx
+		mov [rsi + 8], ecx
+		mov [rsi + 12], edx
+#endif /* SLJIT_CONFIG_X86_32 */
 	}

 #endif /* _MSC_VER && _MSC_VER >= 1400 */

+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+__msan_unpoison(info, 4 * sizeof(sljit_u32));
+#endif /* __has_feature(memory_sanitizer) */
+#endif /* defined(__has_feature) */
+
+}
+
+static void get_cpu_features(void)
+{
+	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
+	sljit_u32 info[4];
+	sljit_u32 max_id;
+
+	info[0] = 0;
+	execute_cpu_id(info);
+	max_id = info[0];
+
+	if (max_id >= 7) {
+		info[0] = 7;
+		info[2] = 0;
+		execute_cpu_id(info);
+
+		if (info[1] & 0x8)
+			feature_list |= CPU_FEATURE_TZCNT;
+		if (info[1] & 0x20)
+			feature_list |= CPU_FEATURE_AVX2;
+	}
+
+	if (max_id >= 1) {
+		info[0] = 1;
+		execute_cpu_id(info);
+
+		if (info[2] & 0x80000)
+			feature_list |= CPU_FEATURE_SSE41;
+		if (info[2] & 0x10000000)
+			feature_list |= CPU_FEATURE_AVX;
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-	cpu_has_sse2 = (features >> 26) & 0x1;
+		if (info[3] & 0x4000000)
+			feature_list |= CPU_FEATURE_SSE2;
 #endif
-	cpu_has_cmov = (features >> 15) & 0x1;
+		if (info[3] & 0x8000)
+			feature_list |= CPU_FEATURE_CMOV;
+	}
+
+	info[0] = 0x80000001;
+	info[2] = 0; /* Silences an incorrect compiler warning. */
+	execute_cpu_id(info);
+
+	if (info[2] & 0x20)
+		feature_list |= CPU_FEATURE_LZCNT;
+
+	cpu_feature_list = feature_list;
 }

-static sljit_u8 get_jump_code(sljit_s32 type)
+static sljit_u8 get_jump_code(sljit_uw type)
 {
 	switch (type) {
 	case SLJIT_EQUAL:
-	case SLJIT_EQUAL_F64:
+	case SLJIT_ATOMIC_STORED:
+	case SLJIT_F_EQUAL:
+	case SLJIT_UNORDERED_OR_EQUAL:
 		return 0x84 /* je */;

 	case SLJIT_NOT_EQUAL:
-	case SLJIT_NOT_EQUAL_F64:
+	case SLJIT_ATOMIC_NOT_STORED:
+	case SLJIT_F_NOT_EQUAL:
+	case SLJIT_ORDERED_NOT_EQUAL:
 		return 0x85 /* jne */;

 	case SLJIT_LESS:
-	case SLJIT_LESS_F64:
+	case SLJIT_CARRY:
+	case SLJIT_F_LESS:
+	case SLJIT_UNORDERED_OR_LESS:
+	case SLJIT_UNORDERED_OR_GREATER:
 		return 0x82 /* jc */;

 	case SLJIT_GREATER_EQUAL:
-	case SLJIT_GREATER_EQUAL_F64:
+	case SLJIT_NOT_CARRY:
+	case SLJIT_F_GREATER_EQUAL:
+	case SLJIT_ORDERED_GREATER_EQUAL:
+	case SLJIT_ORDERED_LESS_EQUAL:
 		return 0x83 /* jae */;

 	case SLJIT_GREATER:
-	case SLJIT_GREATER_F64:
+	case SLJIT_F_GREATER:
+	case SLJIT_ORDERED_LESS:
+	case SLJIT_ORDERED_GREATER:
 		return 0x87 /* jnbe */;

 	case SLJIT_LESS_EQUAL:
-	case SLJIT_LESS_EQUAL_F64:
+	case SLJIT_F_LESS_EQUAL:
+	case SLJIT_UNORDERED_OR_GREATER_EQUAL:
+	case SLJIT_UNORDERED_OR_LESS_EQUAL:
 		return 0x86 /* jbe */;

 	case SLJIT_SIG_LESS:
@@ -411,17 +600,17 @@ static sljit_u8 get_jump_code(sljit_s32 type)
 		return 0x8e /* jle */;

 	case SLJIT_OVERFLOW:
-	case SLJIT_MUL_OVERFLOW:
 		return 0x80 /* jo */;

 	case SLJIT_NOT_OVERFLOW:
-	case SLJIT_MUL_NOT_OVERFLOW:
 		return 0x81 /* jno */;

-	case SLJIT_UNORDERED_F64:
+	case SLJIT_UNORDERED:
+	case SLJIT_ORDERED_EQUAL: /* NaN. */
 		return 0x8a /* jp */;

-	case SLJIT_ORDERED_F64:
+	case SLJIT_ORDERED:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL: /* Not NaN. */
 		return 0x8b /* jpo */;
 	}
 	return 0;
@@ -436,22 +625,22 @@ static sljit_u8* generate_put_label_code(struct sljit_put_label *put_label, slji

 static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_sw executable_offset)
 {
-	sljit_s32 type = jump->flags >> TYPE_SHIFT;
+	sljit_uw type = jump->flags >> TYPE_SHIFT;
 	sljit_s32 short_jump;
 	sljit_uw label_addr;

 	if (jump->flags & JUMP_LABEL)
 		label_addr = (sljit_uw)(code + jump->u.label->size);
 	else
-		label_addr = jump->u.target - executable_offset;
-
-	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
+		label_addr = jump->u.target - (sljit_uw)executable_offset;

 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
+	if ((sljit_sw)(label_addr - (jump->addr + 2)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 6)) < HALFWORD_MIN)
 		return generate_far_jump_code(jump, code_ptr);
 #endif

+	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
+
 	if (type == SLJIT_JUMP) {
 		if (short_jump)
 			*code_ptr++ = JMP_i8;
@@ -465,7 +654,7 @@ static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code
 		jump->addr++;
 	}
 	else if (short_jump) {
-		*code_ptr++ = get_jump_code(type) - 0x10;
+		*code_ptr++ = U8(get_jump_code(type) - 0x10);
 		jump->addr++;
 	}
 	else {
@@ -494,7 +683,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 	sljit_u8 *buf_end;
 	sljit_u8 len;
 	sljit_sw executable_offset;
-	sljit_sw jump_addr;
+	sljit_uw jump_addr;

 	struct sljit_label *label;
 	struct sljit_jump *jump;
@@ -532,7 +721,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil
 		switch (*buf_ptr) {
 		case 0:
 			label->addr = (sljit_uw)SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
-			label->size = code_ptr - code;
+			label->size = (sljit_uw)(code_ptr - code);
 			label = label->next;
 			break;
 		case 1:
@@ -577,32 +766,33 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil

 	jump = compiler->jumps;
 	while (jump) {
-		jump_addr = jump->addr + executable_offset;
+		if (jump->flags & (PATCH_MB | PATCH_MW)) {
+			if (jump->flags & JUMP_LABEL)
+				jump_addr = jump->u.label->addr;
+			else
+				jump_addr = jump->u.target;

-		if (jump->flags & PATCH_MB) {
-			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
-			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
-		} else if (jump->flags & PATCH_MW) {
-			if (jump->flags & JUMP_LABEL) {
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
-#else
-				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
-				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
-#endif
-			}
-			else {
+			jump_addr -= jump->addr + (sljit_uw)executable_offset;
+
+			if (jump->flags & PATCH_MB) {
+				jump_addr -= sizeof(sljit_s8);
+				SLJIT_ASSERT((sljit_sw)jump_addr >= -128 && (sljit_sw)jump_addr <= 127);
+				*(sljit_u8*)jump->addr = U8(jump_addr);
+			} else {
+				jump_addr -= sizeof(sljit_s32);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
+				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump_addr);
 #else
-				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
-				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
+				SLJIT_ASSERT((sljit_sw)jump_addr >= HALFWORD_MIN && (sljit_sw)jump_addr <= HALFWORD_MAX);
+				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)jump_addr);
 #endif
 			}
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		else if (jump->flags & PATCH_MD)
-			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
+		else if (jump->flags & PATCH_MD) {
+			SLJIT_ASSERT(jump->flags & JUMP_LABEL);
+			sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)jump->u.label->addr);
+		}
 #endif

 		jump = jump->next;
@@ -628,7 +818,7 @@ SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compil

 	compiler->error = SLJIT_ERR_COMPILED;
 	compiler->executable_offset = executable_offset;
-	compiler->executable_size = code_ptr - code;
+	compiler->executable_size = (sljit_uw)(code_ptr - code);

 	code = (sljit_u8*)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);

@@ -641,11 +831,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-		if (cpu_has_sse2 == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_sse2;
+		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
 #else /* SLJIT_DETECT_SSE2 */
 		return 1;
 #endif /* SLJIT_DETECT_SSE2 */
@@ -653,48 +843,112 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	case SLJIT_HAS_VIRTUAL_REGISTERS:
 		return 1;
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */

 	case SLJIT_HAS_CLZ:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+
+		return (cpu_feature_list & CPU_FEATURE_LZCNT) ? 1 : 2;
+
+	case SLJIT_HAS_CTZ:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+
+		return (cpu_feature_list & CPU_FEATURE_TZCNT) ? 1 : 2;
+
 	case SLJIT_HAS_CMOV:
-		if (cpu_has_cmov == -1)
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_cmov;
+		return (cpu_feature_list & CPU_FEATURE_CMOV) != 0;

+	case SLJIT_HAS_REV:
+	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_PREFETCH:
+	case SLJIT_HAS_COPY_F32:
+	case SLJIT_HAS_COPY_F64:
+	case SLJIT_HAS_ATOMIC:
 		return 1;

-	case SLJIT_HAS_SSE2:
-#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-		if (cpu_has_sse2 == -1)
+#if !(defined SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE
+	case SLJIT_HAS_AVX:
+		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return cpu_has_sse2;
-#else
-		return 1;
-#endif
-
+		return (cpu_feature_list & CPU_FEATURE_AVX) != 0;
+	case SLJIT_HAS_AVX2:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+		return (cpu_feature_list & CPU_FEATURE_AVX2) != 0;
+	case SLJIT_HAS_SIMD:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+		return (cpu_feature_list & CPU_FEATURE_SSE41) != 0;
+#endif /* SLJIT_IS_FPU_AVAILABLE */
 	default:
 		return 0;
 	}
 }

+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_cmp_info(sljit_s32 type)
+{
+	switch (type) {
+	case SLJIT_ORDERED_EQUAL:
+	case SLJIT_UNORDERED_OR_NOT_EQUAL:
+		return 2;
+	}
+
+	return 0;
+}
+
 /* --------------------------------------------------------------------- */
 /*  Operators                                                            */
 /* --------------------------------------------------------------------- */

 #define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))

-static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
-	sljit_u32 op_types,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w);
+#define BINARY_IMM32(op_imm, immw, arg, argw) \
+	do { \
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
+		FAIL_IF(!inst); \
+		*(inst + 1) |= (op_imm); \
+	} while (0)

-static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
-	sljit_u32 op_types,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src1, sljit_sw src1w,
-	sljit_s32 src2, sljit_sw src2w);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+
+#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
+	do { \
+		if (IS_HALFWORD(immw) || compiler->mode32) { \
+			BINARY_IMM32(op_imm, immw, arg, argw); \
+		} \
+		else { \
+			FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
+			inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
+			FAIL_IF(!inst); \
+			*inst = (op_mr); \
+		} \
+	} while (0)
+
+#define BINARY_EAX_IMM(op_eax_imm, immw) \
+	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
+
+#else /* !SLJIT_CONFIG_X86_64 */
+
+#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
+	BINARY_IMM32(op_imm, immw, arg, argw)
+
+#define BINARY_EAX_IMM(op_eax_imm, immw) \
+	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
+
+#endif /* SLJIT_CONFIG_X86_64 */
+
+static sljit_s32 emit_byte(struct sljit_compiler *compiler, sljit_u8 byte)
+{
+	sljit_u8 *inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
+	FAIL_IF(!inst);
+	INC_SIZE(1);
+	*inst = byte;
+	return SLJIT_SUCCESS;
+}

 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
 	sljit_s32 dst, sljit_sw dstw,
@@ -703,6 +957,14 @@ static sljit_s32 emit_mov(struct sljit_compiler *compiler,
 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));

+static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
+	sljit_uw op,
+	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
+
+static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
+	sljit_uw op,
+	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
+
 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);

@@ -713,6 +975,10 @@ static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w);

+static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_reg,
+	sljit_s32 src, sljit_sw srcw);
+
 static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
 {
 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET)
@@ -721,14 +987,14 @@ static SLJIT_INLINE sljit_s32 emit_endbranch(struct sljit_compiler *compiler)
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
 	FAIL_IF(!inst);
 	INC_SIZE(4);
-	*inst++ = 0xf3;
-	*inst++ = 0x0f;
-	*inst++ = 0x1e;
+	inst[0] = GROUP_F3;
+	inst[1] = GROUP_0F;
+	inst[2] = 0x1e;
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	*inst = 0xfb;
-#else
-	*inst = 0xfa;
-#endif
+	inst[3] = 0xfb;
+#else /* !SLJIT_CONFIG_X86_32 */
+	inst[3] = 0xfa;
+#endif /* SLJIT_CONFIG_X86_32 */
 #else /* !SLJIT_CONFIG_X86_CET */
 	SLJIT_UNUSED_ARG(compiler);
 #endif /* SLJIT_CONFIG_X86_CET */
@@ -751,13 +1017,17 @@ static SLJIT_INLINE sljit_s32 emit_rdssp(struct sljit_compiler *compiler, sljit_
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 	FAIL_IF(!inst);
 	INC_SIZE(size);
-	*inst++ = 0xf3;
+	*inst++ = GROUP_F3;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
 #endif
-	*inst++ = 0x0f;
-	*inst++ = 0x1e;
-	*inst = (0x3 << 6) | (0x1 << 3) | (reg_map[reg] & 0x7);
+	inst[0] = GROUP_0F;
+	inst[1] = 0x1e;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_lmap[reg]);
+#else
+	inst[2] = U8(MOD_REG | (0x1 << 3) | reg_map[reg]);
+#endif
 	return SLJIT_SUCCESS;
 }

@@ -775,13 +1045,13 @@ static SLJIT_INLINE sljit_s32 emit_incssp(struct sljit_compiler *compiler, sljit
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 	FAIL_IF(!inst);
 	INC_SIZE(size);
-	*inst++ = 0xf3;
+	*inst++ = GROUP_F3;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : REX_B);
 #endif
-	*inst++ = 0x0f;
-	*inst++ = 0xae;
-	*inst = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
+	inst[0] = GROUP_0F;
+	inst[1] = 0xae;
+	inst[2] = (0x3 << 6) | (0x5 << 3) | (reg_map[reg] & 0x7);
 	return SLJIT_SUCCESS;
 }

@@ -797,7 +1067,7 @@ static SLJIT_INLINE sljit_s32 cpu_has_shadow_stack(void)
 }

 static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compiler *compiler,
-	sljit_s32 src, sljit_sw srcw, sljit_s32 base, sljit_sw disp)
+	sljit_s32 src, sljit_sw srcw)
 {
 #if (defined SLJIT_CONFIG_X86_CET && SLJIT_CONFIG_X86_CET) && defined (__SHSTK__)
 	sljit_u8 *inst, *jz_after_cmp_inst;
@@ -809,25 +1079,7 @@ static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compile
 	FAIL_IF(emit_rdssp(compiler, TMP_REG1));

 	/* Load return address on shadow stack into TMP_REG1. */
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	SLJIT_ASSERT(reg_map[TMP_REG1] == 5);
-
-	/* Hand code unsupported "mov 0x0(%ebp),%ebp". */
-	inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
-	FAIL_IF(!inst);
-	INC_SIZE(3);
-	*inst++ = 0x8b;
-	*inst++ = 0x6d;
-	*inst = 0;
-#else /* !SLJIT_CONFIG_X86_32 */
 	EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_MEM1(TMP_REG1), 0);
-#endif /* SLJIT_CONFIG_X86_32 */
-
-	if (src == SLJIT_UNUSED) {
-		/* Return address is on stack. */
-		src = SLJIT_MEM1(base);
-		srcw = disp;
-	}

 	/* Compare return address against TMP_REG1. */
 	FAIL_IF(emit_cmp_binary (compiler, TMP_REG1, 0, src, srcw));
@@ -855,16 +1107,14 @@ static SLJIT_INLINE sljit_s32 adjust_shadow_stack(struct sljit_compile
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 	FAIL_IF(!inst);
 	INC_SIZE(2);
-	*inst++ = JMP_i8;
-	*inst = size_before_rdssp_inst - compiler->size;
+	inst[0] = JMP_i8;
+	inst[1] = size_before_rdssp_inst - compiler->size;

 	*jz_after_cmp_inst = compiler->size - size_jz_after_cmp_inst;
 #else /* !SLJIT_CONFIG_X86_CET || !__SHSTK__ */
 	SLJIT_UNUSED_ARG(compiler);
 	SLJIT_UNUSED_ARG(src);
 	SLJIT_UNUSED_ARG(srcw);
-	SLJIT_UNUSED_ARG(base);
-	SLJIT_UNUSED_ARG(disp);
 #endif /* SLJIT_CONFIG_X86_CET && __SHSTK__ */
 	return SLJIT_SUCCESS;
 }
@@ -881,25 +1131,24 @@ static sljit_s32 emit_mov(struct sljit_compiler *compiler,
 {
 	sljit_u8* inst;

-	SLJIT_ASSERT(dst != SLJIT_UNUSED);
-
 	if (FAST_IS_REG(src)) {
 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
 		FAIL_IF(!inst);
 		*inst = MOV_rm_r;
 		return SLJIT_SUCCESS;
 	}
-	if (src & SLJIT_IMM) {
+
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
+			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
 #else
 			if (!compiler->mode32) {
 				if (NOT_HALFWORD(srcw))
 					return emit_load_imm64(compiler, dst, srcw);
 			}
 			else
-				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
+				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, U8(MOV_r_i32 | reg_lmap[dst]), srcw);
 #endif
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -936,11 +1185,32 @@ static sljit_s32 emit_mov(struct sljit_compiler *compiler,
 	return SLJIT_SUCCESS;
 }

+static sljit_s32 emit_cmov_generic(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_reg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_u8* inst;
+	sljit_uw size;
+
+	SLJIT_ASSERT(type >= SLJIT_EQUAL && type <= SLJIT_ORDERED_LESS_EQUAL);
+
+	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
+	FAIL_IF(!inst);
+	INC_SIZE(2);
+	inst[0] = U8(get_jump_code((sljit_uw)type ^ 0x1) - 0x10);
+
+	size = compiler->size;
+	EMIT_MOV(compiler, dst_reg, 0, src, srcw);
+
+	inst[1] = U8(compiler->size - size);
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
 {
 	sljit_u8 *inst;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	sljit_s32 size;
+	sljit_uw size;
 #endif

 	CHECK_ERROR();
@@ -948,17 +1218,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compile
 	switch (GET_OPCODE(op)) {
 	case SLJIT_BREAKPOINT:
-		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
-		FAIL_IF(!inst);
-		INC_SIZE(1);
-		*inst = INT3;
-		break;
+		return emit_byte(compiler, INT3);
 	case SLJIT_NOP:
-		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
-		FAIL_IF(!inst);
-		INC_SIZE(1);
-		*inst = NOP;
-		break;
+		return emit_byte(compiler, NOP);
 	case SLJIT_LMUL_UW:
 	case SLJIT_LMUL_SW:
 	case SLJIT_DIVMOD_UW:
@@ -977,7 +1239,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compile
 			&& reg_map[SLJIT_R1] < 7
 			&& reg_map[TMP_REG1] == 2);
 #endif
-		compiler->mode32 = op & SLJIT_I32_OP;
+		compiler->mode32 = op & SLJIT_32;
 #endif
 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);

@@ -999,23 +1261,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compile
 #endif

 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
-		FAIL_IF(!inst);
-		INC_SIZE(1);
-		*inst = CDQ;
+		FAIL_IF(emit_byte(compiler, CDQ));
 #else
-		if (compiler->mode32) {
-			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
-			FAIL_IF(!inst);
-			INC_SIZE(1);
-			*inst = CDQ;
-		} else {
+		if (!compiler->mode32) {
 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 			FAIL_IF(!inst);
 			INC_SIZE(2);
-			*inst++ = REX_W;
-			*inst = CDQ;
-		}
+			inst[0] = REX_W;
+			inst[1] = CDQ;
+		} else
+			FAIL_IF(emit_byte(compiler, CDQ));
 #endif

@@ -1023,14 +1278,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compile
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 		FAIL_IF(!inst);
 		INC_SIZE(2);
-		*inst++ = GROUP_F7;
-		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
-#else
+		inst[0] = GROUP_F7;
+		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
+#else /* !SLJIT_CONFIG_X86_32 */
 #ifdef _WIN64
 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
-#else
+#else /* !_WIN64 */
 		size = (!compiler->mode32) ? 3 : 2;
-#endif
+#endif /* _WIN64 */
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
 		FAIL_IF(!inst);
 		INC_SIZE(size);
@@ -1039,29 +1294,29 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compile
 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
 		else if (op >= SLJIT_DIVMOD_UW)
 			*inst++ = REX_B;
-		*inst++ = GROUP_F7;
-		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
-#else
+		inst[0] = GROUP_F7;
+		inst[1] = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
+#else /* !_WIN64 */
 		if (!compiler->mode32)
 			*inst++ = REX_W;
-		*inst++ = GROUP_F7;
-		*inst = MOD_REG | reg_map[SLJIT_R1];
-#endif
-#endif
+		inst[0] = GROUP_F7;
+		inst[1] = MOD_REG | reg_map[SLJIT_R1];
+#endif /* _WIN64 */
+#endif /* SLJIT_CONFIG_X86_32 */
 		switch (op) {
 		case SLJIT_LMUL_UW:
-			*inst |= MUL;
+			inst[1] |= MUL;
 			break;
 		case SLJIT_LMUL_SW:
-			*inst |= IMUL;
+			inst[1] |= IMUL;
 			break;
 		case SLJIT_DIVMOD_UW:
 		case SLJIT_DIV_UW:
-			*inst |= DIV;
+			inst[1] |= DIV;
 			break;
 		case SLJIT_DIVMOD_SW:
 		case SLJIT_DIV_SW:
-			*inst |= IDIV;
+			inst[1] |= IDIV;
 			break;
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
@@ -1081,32 +1336,21 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compile
 	return SLJIT_SUCCESS;
 }

-#define ENCODE_PREFIX(prefix) \
-	do { \
-		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
-		FAIL_IF(!inst); \
-		INC_SIZE(1); \
-		*inst = (prefix); \
-	} while (0)
-
 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_u8* inst;
 	sljit_s32 dst_r;
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	sljit_s32 work_r;
-#endif

 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	compiler->mode32 = 0;
 #endif

-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
+			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
 #else
 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
 			FAIL_IF(!inst);
@@ -1132,100 +1376,33 @@ static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
 #else
 		dst_r = src;
 #endif
-	}
+	} else {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
-		/* src, dst are registers. */
-		SLJIT_ASSERT(SLOW_IS_REG(dst));
-		if (reg_map[dst] < 4) {
-			if (dst != src)
-				EMIT_MOV(compiler, dst, 0, src, 0);
-			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
-			FAIL_IF(!inst);
-			*inst++ = GROUP_0F;
-			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
-		}
-		else {
-			if (dst != src)
-				EMIT_MOV(compiler, dst, 0, src, 0);
-			if (sign) {
-				/* shl reg, 24 */
-				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
-				FAIL_IF(!inst);
-				*inst |= SHL;
-				/* sar reg, 24 */
-				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
-				FAIL_IF(!inst);
-				*inst |= SAR;
-			}
-			else {
+		if (FAST_IS_REG(src) && reg_map[src] >= 4) {
+			/* Both src and dst are registers. */
+			SLJIT_ASSERT(FAST_IS_REG(dst));
+
+			if (src == dst && !sign) {
 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
 				FAIL_IF(!inst);
 				*(inst + 1) |= AND;
+				return SLJIT_SUCCESS;
 			}
+
+			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
+			src = TMP_REG1;
+			srcw = 0;
 		}
-		return SLJIT_SUCCESS;
-	}
-#endif
-	else {
+#endif /* !SLJIT_CONFIG_X86_32 */
+
 		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
+		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, dst_r, src, srcw));
 	}

 	if (dst & SLJIT_MEM) {
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		if (dst_r == TMP_REG1) {
-			/* Find a non-used register, whose reg_map[src] < 4. */
-			if ((dst & REG_MASK) == SLJIT_R0) {
-				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
-					work_r = SLJIT_R2;
-				else
-					work_r = SLJIT_R1;
-			}
-			else {
-				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
-					work_r = SLJIT_R0;
-				else if ((dst & REG_MASK) == SLJIT_R1)
-					work_r = SLJIT_R2;
-				else
-					work_r = SLJIT_R1;
-			}
-
-			if (work_r == SLJIT_R0) {
-				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
-			}
-			else {
-				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
-				FAIL_IF(!inst);
-				*inst = XCHG_r_rm;
-			}
-
-			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
-			FAIL_IF(!inst);
-			*inst = MOV_rm8_r8;
-
-			if (work_r == SLJIT_R0) {
-				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
-			}
-			else {
-				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
-				FAIL_IF(!inst);
-				*inst = XCHG_r_rm;
-			}
-		}
-		else {
-			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
-			FAIL_IF(!inst);
-			*inst = MOV_rm8_r8;
-		}
-#else
 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
 		FAIL_IF(!inst);
 		*inst = MOV_rm8_r8;
-#endif
 	}

 	return SLJIT_SUCCESS;
@@ -1242,15 +1419,15 @@ static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,

 	inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst++ = PREFETCH;
+	inst[0] = GROUP_0F;
+	inst[1] = PREFETCH;

 	if (op == SLJIT_PREFETCH_L1)
-		*inst |= (1 << 3);
+		inst[2] |= (1 << 3);
 	else if (op == SLJIT_PREFETCH_L2)
-		*inst |= (2 << 3);
+		inst[2] |= (2 << 3);
 	else if (op == SLJIT_PREFETCH_L3)
-		*inst |= (3 << 3);
+		inst[2] |= (3 << 3);

 	return SLJIT_SUCCESS;
 }
@@ -1266,10 +1443,10 @@ static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
 	compiler->mode32 = 0;
 #endif

-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
+			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
 #else
 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
 			FAIL_IF(!inst);
@@ -1287,12 +1464,8 @@ static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,

 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
 		dst_r = src;
-	else {
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
-	}
+	else
+		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, dst_r, src, srcw));

 	if (dst & SLJIT_MEM) {
 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
@@ -1313,136 +1486,206 @@ static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
 		/* Same input and output */
 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_F7;
-		*inst |= opcode;
-		return SLJIT_SUCCESS;
-	}
-
-	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
-		dst = TMP_REG1;
-
-	if (FAST_IS_REG(dst)) {
-		EMIT_MOV(compiler, dst, 0, src, srcw);
-		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
-		FAIL_IF(!inst);
-		*inst++ = GROUP_F7;
-		*inst |= opcode;
+		inst[0] = GROUP_F7;
+		inst[1] |= opcode;
 		return SLJIT_SUCCESS;
 	}

-	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
-	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
-	FAIL_IF(!inst);
-	*inst++ = GROUP_F7;
-	*inst |= opcode;
-	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
-	return SLJIT_SUCCESS;
-}
-
-static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
-	sljit_s32 dst, sljit_sw dstw,
-	sljit_s32 src, sljit_sw srcw)
-{
-	sljit_u8* inst;
-
-	if (dst == SLJIT_UNUSED)
-		dst = TMP_REG1;
-
 	if (FAST_IS_REG(dst)) {
 		EMIT_MOV(compiler, dst, 0, src, srcw);
 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_F7;
-		*inst |= NOT_rm;
-		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
-		FAIL_IF(!inst);
-		*inst = OR_r_rm;
+		inst[0] = GROUP_F7;
+		inst[1] |= opcode;
 		return SLJIT_SUCCESS;
 	}

 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
 	FAIL_IF(!inst);
-	*inst++ = GROUP_F7;
-	*inst |= NOT_rm;
-	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
-	FAIL_IF(!inst);
-	*inst = OR_r_rm;
+	inst[0] = GROUP_F7;
+	inst[1] |= opcode;
 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 	return SLJIT_SUCCESS;
 }

 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 static const sljit_sw emit_clz_arg = 32 + 31;
+static const sljit_sw emit_ctz_arg = 32;
 #endif

-static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
+static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_u8* inst;
 	sljit_s32 dst_r;
+	sljit_sw max;

-	SLJIT_UNUSED_ARG(op_flags);
-
-	if (cpu_has_cmov == -1)
-		get_cpu_features();
+	SLJIT_ASSERT(cpu_feature_list != 0);

 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

-	inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-	FAIL_IF(!inst);
-	*inst++ = GROUP_0F;
-	*inst = BSR_r_rm;
+	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
+		FAIL_IF(emit_groupf(compiler, (is_clz ? LZCNT_r_rm : TZCNT_r_rm) | EX86_PREF_F3, dst_r, src, srcw));
+
+		if (dst & SLJIT_MEM)
+			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
+		return SLJIT_SUCCESS;
+	}
+
+	FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, dst_r, src, srcw));

 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	if (cpu_has_cmov) {
+	max = is_clz ? (32 + 31) : 32;
+
+	if (cpu_feature_list & CPU_FEATURE_CMOV) {
 		if (dst_r != TMP_REG1) {
-			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
+			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, max);
 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
 		}
 		else
-			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);
+			inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), is_clz ? (sljit_sw)&emit_clz_arg : (sljit_sw)&emit_ctz_arg);

 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = CMOVE_r_rm;
+		inst[0] = GROUP_0F;
+		inst[1] = CMOVE_r_rm;
 	}
 	else
-		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));
-
-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
-#else
-	if (cpu_has_cmov) {
-		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31));
+		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));

-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
+	if (is_clz) {
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
 		FAIL_IF(!inst);
-		*inst++ = GROUP_0F;
-		*inst = CMOVE_r_rm;
+		*(inst + 1) |= XOR;
 	}
+#else
+	if (is_clz)
+		max = compiler->mode32 ? (32 + 31) : (64 + 63);
 	else
-		FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31)));
+		max = compiler->mode32 ? 32 : 64;

-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
-#endif
+	if (cpu_feature_list & CPU_FEATURE_CMOV) {
+		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
+		FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, dst_r, TMP_REG2, 0));
+	} else
+		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));

-	FAIL_IF(!inst);
-	*(inst + 1) |= XOR;
+	if (is_clz) {
+		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, max >> 1, dst_r, 0);
+		FAIL_IF(!inst);
+		*(inst + 1) |= XOR;
+	}
+#endif

 	if (dst & SLJIT_MEM)
 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 	return SLJIT_SUCCESS;
 }

+static sljit_s32 emit_bswap(struct sljit_compiler *compiler,
+	sljit_s32 op,
+	sljit_s32 dst, sljit_sw dstw,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_u8 *inst;
+	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+	sljit_uw size;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	sljit_u8 rex = 0;
+#else /* !SLJIT_CONFIG_X86_64 */
+	sljit_s32 dst_is_ereg = op & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (op == SLJIT_REV_U32 || op == SLJIT_REV_S32)
+		compiler->mode32 = 1;
+#else /* !SLJIT_CONFIG_X86_64 */
+	op &= ~SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	if (src != dst_r) {
+		/* Only the lower 16 bit is read for eregs. */
+		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
+			FAIL_IF(emit_mov_half(compiler, 0, dst_r, 0, src, srcw));
+		else
+			EMIT_MOV(compiler, dst_r, 0, src, srcw);
+	}
+
+	size = 2;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (!compiler->mode32)
+		rex = REX_W;
+
+	if (reg_map[dst_r] >= 8)
+		rex |= REX_B;
+
+	if (rex != 0)
+		size++;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
+	FAIL_IF(!inst);
+	INC_SIZE(size);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (rex != 0)
+		*inst++ = rex;
+
+	inst[0] = GROUP_0F;
+	inst[1] = BSWAP_r | reg_lmap[dst_r];
+#else /* !SLJIT_CONFIG_X86_64 */
+	inst[0] = GROUP_0F;
+	inst[1] = BSWAP_r | reg_map[dst_r];
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		size = compiler->mode32 ? 16 : 48;
+#else /* !SLJIT_CONFIG_X86_64 */
+		size = 16;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, (sljit_sw)size, dst_r, 0);
+		FAIL_IF(!inst);
+		if (op == SLJIT_REV_U16)
+			inst[1] |= SHR;
+		else
+			inst[1] |= SAR;
+	}
+
+	if (dst & SLJIT_MEM) {
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		if (dst_is_ereg)
+			op = SLJIT_REV;
+#endif /* SLJIT_CONFIG_X86_32 */
+		if (op == SLJIT_REV_U16 || op == SLJIT_REV_S16)
+			return emit_mov_half(compiler, 0, dst, dstw, TMP_REG1, 0);
+
+		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+	}
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (op == SLJIT_REV_S32) {
+		compiler->mode32 = 0;
+		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
+		FAIL_IF(!inst);
+		*inst = MOVSXD_r_rm;
+	}
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
-	sljit_s32 op_flags = GET_ALL_FLAGS(op);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	sljit_s32 dst_is_ereg = 0;
-#endif
+#else /* !SLJIT_CONFIG_X86_32 */
+	sljit_s32 op_flags = GET_ALL_FLAGS(op);
+#endif /* SLJIT_CONFIG_X86_32 */

 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
@@ -1452,35 +1695,35 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compile
 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
 	CHECK_EXTRA_REGS(src, srcw, (void)0);
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	compiler->mode32 = op_flags & SLJIT_I32_OP;
-#endif
+	compiler->mode32 = op_flags & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */

 	op = GET_OPCODE(op);

 	if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		compiler->mode32 = 0;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */

 		if (FAST_IS_REG(src) && src == dst) {
 			if (!TYPE_CAST_NEEDED(op))
 				return SLJIT_SUCCESS;
 		}

-		if (op_flags & SLJIT_I32_OP) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		if (op_flags & SLJIT_32) {
 			if (src & SLJIT_MEM) {
 				if (op == SLJIT_MOV_S32)
 					op = SLJIT_MOV_U32;
 			}
-			else if (src & SLJIT_IMM) {
+			else if (src == SLJIT_IMM) {
 				if (op == SLJIT_MOV_U32)
 					op = SLJIT_MOV_S32;
 			}
-#endif
 		}
+#endif /* SLJIT_CONFIG_X86_64 */

-		if (src & SLJIT_IMM) {
+		if (src == SLJIT_IMM) {
 			switch (op) {
 			case SLJIT_MOV_U8:
 				srcw = (sljit_u8)srcw;
@@ -1501,12 +1744,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compile
 			case SLJIT_MOV_S32:
 				srcw = (sljit_s32)srcw;
 				break;
-#endif
+#endif /* SLJIT_CONFIG_X86_64 */
 			}
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 			if (SLJIT_UNLIKELY(dst_is_ereg))
 				return emit_mov(compiler, dst, dstw, src, srcw);
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 		}

 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
@@ -1514,7 +1757,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compile
 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
 			dst = TMP_REG1;
 		}
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */

 		switch (op) {
 		case SLJIT_MOV:
@@ -1522,8 +1765,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compile
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		case SLJIT_MOV_U32:
 		case SLJIT_MOV_S32:
-#endif
-			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
+		case SLJIT_MOV32:
+#endif /* SLJIT_CONFIG_X86_32 */
+			EMIT_MOV(compiler, dst, dstw, src, srcw);
 			break;
 		case SLJIT_MOV_U8:
 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
@@ -1544,62 +1788,40 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compile
 		case SLJIT_MOV_S32:
 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
 			break;
-#endif
+		case SLJIT_MOV32:
+			compiler->mode32 = 1;
+			EMIT_MOV(compiler, dst, dstw, src, srcw);
+			compiler->mode32 = 0;
+			break;
+#endif /* SLJIT_CONFIG_X86_64 */
 		}

 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
-#endif
+#endif /* SLJIT_CONFIG_X86_32 */
 		return SLJIT_SUCCESS;
 	}

 	switch (op) {
-	case SLJIT_NOT:
-		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
-			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
-		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
-
-	case SLJIT_NEG:
-		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
-
 	case SLJIT_CLZ:
-		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
+	case SLJIT_CTZ:
+		return emit_clz_ctz(compiler, (op == SLJIT_CLZ), dst, dstw, src, srcw);
+
+	case SLJIT_REV:
+	case SLJIT_REV_U16:
+	case SLJIT_REV_S16:
+	case SLJIT_REV_U32:
+	case SLJIT_REV_S32:
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		if (dst_is_ereg)
+			op |= SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_32 */
+		return emit_bswap(compiler, op, dst, dstw, src, srcw);
 	}

 	return SLJIT_SUCCESS;
 }

-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-
-#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
-	if (IS_HALFWORD(immw) || compiler->mode32) { \
-		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
-		FAIL_IF(!inst); \
-		*(inst + 1) |= (op_imm); \
-	} \
-	else { \
-		FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
-		inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
-		FAIL_IF(!inst); \
-		*inst = (op_mr); \
-	}
-
-#define BINARY_EAX_IMM(op_eax_imm, immw) \
-	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
-
-#else
-
-#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
-	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
-	FAIL_IF(!inst); \
-	*(inst + 1) |= (op_imm);
-
-#define BINARY_EAX_IMM(op_eax_imm, immw) \
-	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
-
-#endif
-
 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
 	sljit_u32 op_types,
 	sljit_s32 dst, sljit_sw dstw,
@@ -1607,26 +1829,13 @@ static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
 	sljit_s32 src2, sljit_sw src2w)
 {
 	sljit_u8* inst;
-	sljit_u8 op_eax_imm = (op_types >> 24);
-	sljit_u8 op_rm = (op_types >> 16) & 0xff;
-	sljit_u8 op_mr = (op_types >> 8) & 0xff;
-	sljit_u8 op_imm = op_types & 0xff;
-
-	if (dst == SLJIT_UNUSED) {
-		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
-			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
-		}
-		else {
-			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
-			FAIL_IF(!inst);
-			*inst = op_rm;
-		}
-		return SLJIT_SUCCESS;
-	}
+	sljit_u8 op_eax_imm = U8(op_types >> 24);
+	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
+	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
+	sljit_u8 op_imm = U8(op_types & 0xff);

 	if (dst == src1 && dstw == src1w) {
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
@@ -1660,7 +1869,7 @@ static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,

 	/* Only for cumulative operations. */
 	if (dst == src2 && dstw == src2w) {
-		if (src1 & SLJIT_IMM) {
+		if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
 #else
@@ -1694,7 +1903,7 @@ static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
 	/* General version. */
 	if (FAST_IS_REG(dst)) {
 		EMIT_MOV(compiler, dst, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
 		}
 		else {
@@ -1706,7 +1915,7 @@ static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
 	else {
 		/* This version requires less memory writing. */
 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
 		}
 		else {
@@ -1727,26 +1936,13 @@ static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
 	sljit_s32 src2, sljit_sw src2w)
 {
 	sljit_u8* inst;
-	sljit_u8 op_eax_imm = (op_types >> 24);
-	sljit_u8 op_rm = (op_types >> 16) & 0xff;
-	sljit_u8 op_mr = (op_types >> 8) & 0xff;
-	sljit_u8 op_imm = op_types & 0xff;
-
-	if (dst == SLJIT_UNUSED) {
-		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
-			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
-		}
-		else {
-			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
-			FAIL_IF(!inst);
-			*inst = op_rm;
-		}
-		return SLJIT_SUCCESS;
-	}
+	sljit_u8 op_eax_imm = U8(op_types >> 24);
+	sljit_u8 op_rm = U8((op_types >> 16) & 0xff);
+	sljit_u8 op_mr = U8((op_types >> 8) & 0xff);
+	sljit_u8 op_imm = U8(op_types & 0xff);

 	if (dst == src1 && dstw == src1w) {
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
@@ -1780,7 +1976,7 @@ static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
 	/* General version. */
 	if (FAST_IS_REG(dst) && dst != src2) {
 		EMIT_MOV(compiler, dst, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
 		}
 		else {
@@ -1792,7 +1988,7 @@ static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
 	else {
 		/* This version requires less memory writing. */
 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
 		}
 		else {
@@ -1812,25 +2008,15 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
 	sljit_s32 src2, sljit_sw src2w)
 {
 	sljit_u8* inst;
-	sljit_s32 dst_r;
-
-	dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
+	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

 	/* Register destination.
*/ - if (dst_r == src1 && !(src2 & SLJIT_IMM)) { - inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = IMUL_r_rm; - } - else if (dst_r == src2 && !(src1 & SLJIT_IMM)) { - inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = IMUL_r_rm; - } - else if (src1 & SLJIT_IMM) { - if (src2 & SLJIT_IMM) { + if (dst_r == src1 && src2 != SLJIT_IMM) { + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w)); + } else if (dst_r == src2 && src1 != SLJIT_IMM) { + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src1, src1w)); + } else if (src1 == SLJIT_IMM) { + if (src2 == SLJIT_IMM) { EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w); src2 = dst_r; src2w = 0; @@ -1840,10 +2026,8 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w); FAIL_IF(!inst); *inst = IMUL_r_rm_i8; - inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); - FAIL_IF(!inst); - INC_SIZE(1); - *inst = (sljit_s8)src1w; + + FAIL_IF(emit_byte(compiler, U8(src1w))); } #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) else { @@ -1869,30 +2053,26 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, if (dst_r != src2) EMIT_MOV(compiler, dst_r, 0, src2, src2w); FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w)); - inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = IMUL_r_rm; + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0)); } #endif } - else if (src2 & SLJIT_IMM) { + else if (src2 == SLJIT_IMM) { /* Note: src1 is NOT immediate. */ if (src2w <= 127 && src2w >= -128) { inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w); FAIL_IF(!inst); *inst = IMUL_r_rm_i8; - inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); - FAIL_IF(!inst); - INC_SIZE(1); - *inst = (sljit_s8)src2w; + + FAIL_IF(emit_byte(compiler, U8(src2w))); } #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) else { inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w); FAIL_IF(!inst); *inst = IMUL_r_rm_i32; + inst = (sljit_u8*)ensure_buf(compiler, 1 + 4); FAIL_IF(!inst); INC_SIZE(4); @@ -1903,31 +2083,24 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w); FAIL_IF(!inst); *inst = IMUL_r_rm_i32; + inst = (sljit_u8*)ensure_buf(compiler, 1 + 4); FAIL_IF(!inst); INC_SIZE(4); sljit_unaligned_store_s32(inst, (sljit_s32)src2w); - } - else { + } else { if (dst_r != src1) EMIT_MOV(compiler, dst_r, 0, src1, src1w); FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w)); - inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = IMUL_r_rm; + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0)); } #endif - } - else { + } else { /* Neither argument is immediate. 
*/ if (ADDRESSING_DEPENDS_ON(src2, dst_r)) dst_r = TMP_REG1; EMIT_MOV(compiler, dst_r, 0, src1, src1w); - inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = IMUL_r_rm; + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w)); } if (dst & SLJIT_MEM) @@ -1960,10 +2133,10 @@ static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler, done = 1; } #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) { + if (src2 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src2w))) { inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w); #else - if (src2 & SLJIT_IMM) { + if (src2 == SLJIT_IMM) { inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w); #endif FAIL_IF(!inst); @@ -1973,10 +2146,10 @@ static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler, } else if (FAST_IS_REG(src2)) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) { + if (src1 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src1w))) { inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w); #else - if (src1 & SLJIT_IMM) { + if (src1 == SLJIT_IMM) { inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w); #endif FAIL_IF(!inst); @@ -2000,16 +2173,16 @@ static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler, sljit_u8* inst; #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) { + if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) { #else - if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) { + if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) { #endif BINARY_EAX_IMM(CMP_EAX_i32, src2w); return SLJIT_SUCCESS; } if (FAST_IS_REG(src1)) { - if (src2 & SLJIT_IMM) { + if (src2 == SLJIT_IMM) { BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0); } else { @@ -2020,15 +2193,15 @@ static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler, return SLJIT_SUCCESS; } - if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) { + if (FAST_IS_REG(src2) && src1 != SLJIT_IMM) { inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w); FAIL_IF(!inst); *inst = CMP_rm_r; return SLJIT_SUCCESS; } - if (src2 & SLJIT_IMM) { - if (src1 & SLJIT_IMM) { + if (src2 == SLJIT_IMM) { + if (src1 == SLJIT_IMM) { EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); src1 = TMP_REG1; src1w = 0; @@ -2051,25 +2224,25 @@ static sljit_s32 emit_test_binary(struct sljit_compiler *compiler, sljit_u8* inst; #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) { + if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) { #else - if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) { + if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) { #endif BINARY_EAX_IMM(TEST_EAX_i32, src2w); return SLJIT_SUCCESS; } #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || 
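/* emit_mul above picks between three hardware forms of signed multiply:
   0F AF /r (reg <- reg * r/m), 6B /r ib (reg <- r/m * imm8, sign-extended)
   and 69 /r id (reg <- r/m * imm32). A sketch of the selection only; the
   helper name is illustrative, not sljit API: */
#include <stdint.h>
static unsigned imul_form(int has_imm, int64_t imm)
{
	if (!has_imm)
		return 0xaf; /* 0F AF: IMUL r, r/m */
	return (imm >= -128 && imm <= 127) ? 0x6b : 0x69; /* imm8 vs imm32 */
}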
IS_HALFWORD(src1w))) { + if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) { #else - if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) { + if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128)) { #endif BINARY_EAX_IMM(TEST_EAX_i32, src1w); return SLJIT_SUCCESS; } - if (!(src1 & SLJIT_IMM)) { - if (src2 & SLJIT_IMM) { + if (src1 != SLJIT_IMM) { + if (src2 == SLJIT_IMM) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (IS_HALFWORD(src2w) || compiler->mode32) { inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w); @@ -2097,8 +2270,8 @@ static sljit_s32 emit_test_binary(struct sljit_compiler *compiler, } } - if (!(src2 & SLJIT_IMM)) { - if (src1 & SLJIT_IMM) { + if (src2 != SLJIT_IMM) { + if (src1 == SLJIT_IMM) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (IS_HALFWORD(src1w) || compiler->mode32) { inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w); @@ -2127,7 +2300,7 @@ static sljit_s32 emit_test_binary(struct sljit_compiler *compiler, } EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); - if (src2 & SLJIT_IMM) { + if (src2 == SLJIT_IMM) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (IS_HALFWORD(src2w) || compiler->mode32) { inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0); @@ -2160,27 +2333,23 @@ static sljit_s32 emit_shift(struct sljit_compiler *compiler, sljit_s32 src1, sljit_sw src1w, sljit_s32 src2, sljit_sw src2w) { +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + sljit_s32 mode32; +#endif sljit_u8* inst; - if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) { + if (src2 == SLJIT_IMM || src2 == SLJIT_PREF_SHIFT_REG) { if (dst == src1 && dstw == src1w) { inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw); FAIL_IF(!inst); - *inst |= mode; - return SLJIT_SUCCESS; - } - if (dst == SLJIT_UNUSED) { - EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); - inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0); - FAIL_IF(!inst); - *inst |= mode; + inst[1] |= mode; return SLJIT_SUCCESS; } if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) { EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); FAIL_IF(!inst); - *inst |= mode; + inst[1] |= mode; EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); return SLJIT_SUCCESS; } @@ -2188,14 +2357,14 @@ static sljit_s32 emit_shift(struct sljit_compiler *compiler, EMIT_MOV(compiler, dst, 0, src1, src1w); inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0); FAIL_IF(!inst); - *inst |= mode; + inst[1] |= mode; return SLJIT_SUCCESS; } EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0); FAIL_IF(!inst); - *inst |= mode; + inst[1] |= mode; EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0); return SLJIT_SUCCESS; } @@ -2205,41 +2374,62 @@ static sljit_s32 emit_shift(struct sljit_compiler *compiler, EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w); inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); FAIL_IF(!inst); - *inst |= mode; - EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); + inst[1] |= mode; + return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); } - else if (SLOW_IS_REG(dst) && dst != src2 && 
!ADDRESSING_DEPENDS_ON(src2, dst)) { + + if (FAST_IS_REG(dst) && dst != src2 && dst != TMP_REG1 && !ADDRESSING_DEPENDS_ON(src2, dst)) { if (src1 != dst) EMIT_MOV(compiler, dst, 0, src1, src1w); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + mode32 = compiler->mode32; + compiler->mode32 = 0; +#endif EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = mode32; +#endif EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w); inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0); FAIL_IF(!inst); - *inst |= mode; + inst[1] |= mode; +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 0; +#endif EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = mode32; +#endif + return SLJIT_SUCCESS; } - else { - /* This case is complex since ecx itself may be used for - addressing, and this case must be supported as well. */ - EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); + + /* This case is complex since ecx itself may be used for + addressing, and this case must be supported as well. */ + EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0); - EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w); - inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); - FAIL_IF(!inst); - *inst |= mode; - EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0); + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0); +#else /* !SLJIT_CONFIG_X86_32 */ + mode32 = compiler->mode32; + compiler->mode32 = 0; + EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0); + compiler->mode32 = mode32; +#endif /* SLJIT_CONFIG_X86_32 */ + + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w); + inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); + FAIL_IF(!inst); + inst[1] |= mode; + +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0); #else - EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0); - EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w); - inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); - FAIL_IF(!inst); - *inst |= mode; - EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0); -#endif - if (dst != SLJIT_UNUSED) - return emit_mov(compiler, dst, dstw, TMP_REG1, 0); - } + compiler->mode32 = 0; + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0); + compiler->mode32 = mode32; +#endif /* SLJIT_CONFIG_X86_32 */ + + if (dst != TMP_REG1) + return emit_mov(compiler, dst, dstw, TMP_REG1, 0); return SLJIT_SUCCESS; } @@ -2251,14 +2441,15 @@ static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler, sljit_s32 src2, sljit_sw src2w) { /* The CPU does not set flags if the shift count is 0. */ - if (src2 & SLJIT_IMM) { + if (src2 == SLJIT_IMM) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0)) - return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w); -#else - if ((src2w & 0x1f) != 0) + src2w &= compiler->mode32 ? 
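/* Why the count is masked right here before deciding anything: the CPU
   itself reduces a shift count mod 32 (mod 64 with REX.W), and a count
   that reduces to 0 leaves both the value and the flags unchanged, which
   is why a masked-out shift is turned into a plain move, or an
   "or dst, 0" when flags are requested. Architectural behaviour, sketched
   for the 32-bit case: */
#include <stdint.h>
static uint32_t x86_shl32(uint32_t value, unsigned count)
{
	count &= 0x1f; /* hardware masking; 0 is a complete no-op, flags too */
	return value << count;
}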
0x1f : 0x3f; +#else /* !SLJIT_CONFIG_X86_64 */ + src2w &= 0x1f; +#endif /* SLJIT_CONFIG_X86_64 */ + if (src2w != 0) return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w); -#endif + if (!set_flags) return emit_mov(compiler, dst, dstw, src1, src1w); /* OR dst, src, 0 */ @@ -2275,7 +2466,7 @@ static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler, FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w)); if (FAST_IS_REG(dst)) - return emit_cmp_binary(compiler, (dst == SLJIT_UNUSED) ? TMP_REG1 : dst, dstw, SLJIT_IMM, 0); + return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0); return SLJIT_SUCCESS; } @@ -2285,7 +2476,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile sljit_s32 src2, sljit_sw src2w) { CHECK_ERROR(); - CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w)); + CHECK(check_sljit_emit_op2(compiler, op, 0, dst, dstw, src1, src1w, src2, src2w)); ADJUST_LOCAL_OFFSET(dst, dstw); ADJUST_LOCAL_OFFSET(src1, src1w); ADJUST_LOCAL_OFFSET(src2, src2w); @@ -2294,11 +2485,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile CHECK_EXTRA_REGS(src1, src1w, (void)0); CHECK_EXTRA_REGS(src2, src2w, (void)0); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) - compiler->mode32 = op & SLJIT_I32_OP; + compiler->mode32 = op & SLJIT_32; #endif - if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) - return SLJIT_SUCCESS; + SLJIT_ASSERT(dst != TMP_REG1 || HAS_FLAGS(op)); switch (GET_OPCODE(op)) { case SLJIT_ADD: @@ -2312,17 +2502,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile return emit_cum_binary(compiler, BINARY_OPCODE(ADC), dst, dstw, src1, src1w, src2, src2w); case SLJIT_SUB: + if (src1 == SLJIT_IMM && src1w == 0) + return emit_unary(compiler, NEG_rm, dst, dstw, src2, src2w); + if (!HAS_FLAGS(op)) { - if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED) + if (src2 == SLJIT_IMM && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED) return compiler->error; - if (SLOW_IS_REG(dst) && src2 == dst) { + if (FAST_IS_REG(dst) && src2 == dst) { FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w)); return emit_unary(compiler, NEG_rm, dst, 0, dst, 0); } } - if (dst == SLJIT_UNUSED) - return emit_cmp_binary(compiler, src1, src1w, src2, src2w); return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, dstw, src1, src1w, src2, src2w); case SLJIT_SUBC: @@ -2331,27 +2522,261 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compile case SLJIT_MUL: return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w); case SLJIT_AND: - if (dst == SLJIT_UNUSED) - return emit_test_binary(compiler, src1, src1w, src2, src2w); return emit_cum_binary(compiler, BINARY_OPCODE(AND), dst, dstw, src1, src1w, src2, src2w); case SLJIT_OR: return emit_cum_binary(compiler, BINARY_OPCODE(OR), dst, dstw, src1, src1w, src2, src2w); case SLJIT_XOR: + if (!HAS_FLAGS(op)) { + if (src2 == SLJIT_IMM && src2w == -1) + return emit_unary(compiler, NOT_rm, dst, dstw, src1, src1w); + if (src1 == SLJIT_IMM && src1w == -1) + return emit_unary(compiler, NOT_rm, dst, dstw, src2, src2w); + } + return emit_cum_binary(compiler, BINARY_OPCODE(XOR), dst, dstw, src1, src1w, src2, src2w); case SLJIT_SHL: + case SLJIT_MSHL: return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op), dst, dstw, src1, src1w, src2, 
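/* Context for the register shuffling inside emit_shift above: variable-
   count shifts encode their count only in cl (D3 /4 is SHL r/m32, cl), so
   whenever ecx is the destination or participates in addressing, its value
   is parked in TMP_REG1/TMP_REG2 (or a stack slot on x86-32) around the
   shift and restored afterwards. Encoding example: */
static const unsigned char shl_edx_cl[] = { 0xd3, 0xe2 }; /* shl edx, cl */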
src2w); case SLJIT_LSHR: + case SLJIT_MLSHR: return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op), dst, dstw, src1, src1w, src2, src2w); case SLJIT_ASHR: + case SLJIT_MASHR: return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op), dst, dstw, src1, src1w, src2, src2w); + case SLJIT_ROTL: + return emit_shift_with_flags(compiler, ROL, 0, + dst, dstw, src1, src1w, src2, src2w); + case SLJIT_ROTR: + return emit_shift_with_flags(compiler, ROR, 0, + dst, dstw, src1, src1w, src2, src2w); + } + + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 src1, sljit_sw src1w, + sljit_s32 src2, sljit_sw src2w) +{ + sljit_s32 opcode = GET_OPCODE(op); + + CHECK_ERROR(); + CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w)); + + if (opcode != SLJIT_SUB && opcode != SLJIT_AND) { + SLJIT_SKIP_CHECKS(compiler); + return sljit_emit_op2(compiler, op, TMP_REG1, 0, src1, src1w, src2, src2w); + } + + ADJUST_LOCAL_OFFSET(src1, src1w); + ADJUST_LOCAL_OFFSET(src2, src2w); + + CHECK_EXTRA_REGS(src1, src1w, (void)0); + CHECK_EXTRA_REGS(src2, src2w, (void)0); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = op & SLJIT_32; +#endif + + if (opcode == SLJIT_SUB) { + return emit_cmp_binary(compiler, src1, src1w, src2, src2w); + } + return emit_test_binary(compiler, src1, src1w, src2, src2w); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 dst_reg, + sljit_s32 src1_reg, + sljit_s32 src2_reg, + sljit_s32 src3, sljit_sw src3w) +{ + sljit_s32 is_rotate, is_left, move_src1; + sljit_u8* inst; + sljit_sw src1w = 0; + sljit_sw dstw = 0; + /* The whole register must be saved even for 32 bit operations. */ + sljit_u8 restore_ecx = 0; +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + sljit_sw src2w = 0; + sljit_s32 restore_sp4 = 0; +#endif /* SLJIT_CONFIG_X86_32 */ + + CHECK_ERROR(); + CHECK(check_sljit_emit_shift_into(compiler, op, dst_reg, src1_reg, src2_reg, src3, src3w)); + ADJUST_LOCAL_OFFSET(src3, src3w); + + CHECK_EXTRA_REGS(dst_reg, dstw, (void)0); + CHECK_EXTRA_REGS(src3, src3w, (void)0); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = op & SLJIT_32; +#endif /* SLJIT_CONFIG_X86_64 */ + + if (src3 == SLJIT_IMM) { +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + src3w &= 0x1f; +#else /* !SLJIT_CONFIG_X86_32 */ + src3w &= (op & SLJIT_32) ? 0x1f : 0x3f; +#endif /* SLJIT_CONFIG_X86_32 */ + + if (src3w == 0) + return SLJIT_SUCCESS; + } + + is_left = (GET_OPCODE(op) == SLJIT_SHL || GET_OPCODE(op) == SLJIT_MSHL); + + is_rotate = (src1_reg == src2_reg); + CHECK_EXTRA_REGS(src1_reg, src1w, (void)0); + CHECK_EXTRA_REGS(src2_reg, src2w, (void)0); + + if (is_rotate) + return emit_shift(compiler, is_left ? 
ROL : ROR, dst_reg, dstw, src1_reg, src1w, src3, src3w); + +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if (src2_reg & SLJIT_MEM) { + EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w); + src2_reg = TMP_REG1; + } +#endif /* SLJIT_CONFIG_X86_32 */ + + if (dst_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) { +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w); + src1_reg = TMP_REG1; + src1w = 0; +#else /* !SLJIT_CONFIG_X86_64 */ + if (src2_reg != TMP_REG1) { + EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w); + src1_reg = TMP_REG1; + src1w = 0; + } else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) { + restore_sp4 = (src3 == SLJIT_R0) ? SLJIT_R1 : SLJIT_R0; + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0); + EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w); + src1_reg = restore_sp4; + src1w = 0; + } else { + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0); + restore_sp4 = src1_reg; + } +#endif /* SLJIT_CONFIG_X86_64 */ + + if (src3 != SLJIT_PREF_SHIFT_REG) + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w); + } else { + if (src2_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) { +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 0; +#endif /* SLJIT_CONFIG_X86_64 */ + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = op & SLJIT_32; +#endif /* SLJIT_CONFIG_X86_64 */ + src2_reg = TMP_REG1; + restore_ecx = 1; + } + + move_src1 = 0; +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (dst_reg != src1_reg) { + if (dst_reg != src3) { + EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w); + src1_reg = dst_reg; + src1w = 0; + } else + move_src1 = 1; + } +#else /* !SLJIT_CONFIG_X86_64 */ + if (dst_reg & SLJIT_MEM) { + if (src2_reg != TMP_REG1) { + EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w); + src1_reg = TMP_REG1; + src1w = 0; + } else if ((src1_reg & SLJIT_MEM) || src1_reg == SLJIT_PREF_SHIFT_REG) { + restore_sp4 = (src3 == SLJIT_R0) ? 
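/* What sljit_emit_shift_into is assembling here is SHLD/SHRD, a funnel
   shift: the bits vacated in src1 are filled from src2 instead of with
   zeros (SHRD is the mirror image). Portable sketch of the 32-bit left
   variant, illustration only: */
#include <stdint.h>
static uint32_t funnel_shl32(uint32_t src1, uint32_t src2, unsigned count)
{
	count &= 0x1f;
	if (count == 0)
		return src1;
	return (src1 << count) | (src2 >> (32 - count));
}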
SLJIT_R1 : SLJIT_R0; + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), restore_sp4, 0); + EMIT_MOV(compiler, restore_sp4, 0, src1_reg, src1w); + src1_reg = restore_sp4; + src1w = 0; + } else { + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32), src1_reg, 0); + restore_sp4 = src1_reg; + } + } else if (dst_reg != src1_reg) { + if (dst_reg != src3) { + EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w); + src1_reg = dst_reg; + src1w = 0; + } else + move_src1 = 1; + } +#endif /* SLJIT_CONFIG_X86_64 */ + + if (src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) { + if (!restore_ecx) { +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 0; + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0); + compiler->mode32 = op & SLJIT_32; + restore_ecx = 1; +#else /* !SLJIT_CONFIG_X86_64 */ + if (src1_reg != TMP_REG1 && src2_reg != TMP_REG1) { + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0); + restore_ecx = 1; + } else { + EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0); + restore_ecx = 2; + } +#endif /* SLJIT_CONFIG_X86_64 */ + } + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w); + } + + if (move_src1) { + EMIT_MOV(compiler, dst_reg, 0, src1_reg, src1w); + src1_reg = dst_reg; + src1w = 0; + } } + inst = emit_x86_instruction(compiler, 2, src2_reg, 0, src1_reg, src1w); + FAIL_IF(!inst); + inst[0] = GROUP_0F; + + if (src3 == SLJIT_IMM) { + inst[1] = U8((is_left ? SHLD : SHRD) - 1); + + /* Immediate argument is added separately. */ + FAIL_IF(emit_byte(compiler, U8(src3w))); + } else + inst[1] = U8(is_left ? SHLD : SHRD); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (restore_ecx) { + compiler->mode32 = 0; + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0); + } + + if (src1_reg != dst_reg) { + compiler->mode32 = op & SLJIT_32; + return emit_mov(compiler, dst_reg, dstw, src1_reg, 0); + } +#else /* !SLJIT_CONFIG_X86_64 */ + if (restore_ecx) + EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, restore_ecx == 1 ? TMP_REG1 : SLJIT_MEM1(SLJIT_SP), 0); + + if (src1_reg != dst_reg) + EMIT_MOV(compiler, dst_reg, dstw, src1_reg, 0); + + if (restore_sp4) + return emit_mov(compiler, restore_sp4, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_s32)); +#endif /* SLJIT_CONFIG_X86_32 */ + return SLJIT_SUCCESS; } @@ -2371,7 +2796,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp /* Don't adjust shadow stack if it isn't enabled. 
*/ if (!cpu_has_shadow_stack ()) return SLJIT_SUCCESS; - return adjust_shadow_stack(compiler, src, srcw, SLJIT_UNUSED, 0); + return adjust_shadow_stack(compiler, src, srcw); case SLJIT_PREFETCH_L1: case SLJIT_PREFETCH_L2: case SLJIT_PREFETCH_L3: @@ -2382,28 +2807,45 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_src(struct sljit_compiler *comp return SLJIT_SUCCESS; } -SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg) +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_dst(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 dst, sljit_sw dstw) { - CHECK_REG_INDEX(check_sljit_get_register_index(reg)); -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - if (reg >= SLJIT_R3 && reg <= SLJIT_R8) - return -1; -#endif - return reg_map[reg]; + CHECK_ERROR(); + CHECK(check_sljit_emit_op_dst(compiler, op, dst, dstw)); + ADJUST_LOCAL_OFFSET(dst, dstw); + + CHECK_EXTRA_REGS(dst, dstw, (void)0); + + switch (op) { + case SLJIT_FAST_ENTER: + return emit_fast_enter(compiler, dst, dstw); + case SLJIT_GET_RETURN_ADDRESS: + return sljit_emit_get_return_address(compiler, dst, dstw); + } + + return SLJIT_SUCCESS; } -SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg) +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, sljit_s32 reg) { - CHECK_REG_INDEX(check_sljit_get_float_register_index(reg)); + CHECK_REG_INDEX(check_sljit_get_register_index(type, reg)); + + if (type == SLJIT_GP_REGISTER) { #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - return reg; -#else + if (reg >= SLJIT_R3 && reg <= SLJIT_R8) + return -1; +#endif /* SLJIT_CONFIG_X86_32 */ + return reg_map[reg]; + } + + if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_128 && type != SLJIT_SIMD_REG_256 && type != SLJIT_SIMD_REG_512) + return -1; + return freg_map[reg]; -#endif } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler, - void *instruction, sljit_s32 size) + void *instruction, sljit_u32 size) { sljit_u8 *inst; @@ -2422,13 +2864,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *c /* --------------------------------------------------------------------- */ /* Alignment(3) + 4 * 16 bytes. */ -static sljit_s32 sse2_data[3 + (4 * 4)]; -static sljit_s32 *sse2_buffer; +static sljit_u32 sse2_data[3 + (4 * 4)]; +static sljit_u32 *sse2_buffer; static void init_compiler(void) { + get_cpu_features(); + /* Align to 16 bytes. */ - sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf); + sse2_buffer = (sljit_u32*)(((sljit_uw)sse2_data + 15) & ~(sljit_uw)0xf); /* Single precision constants (each constant is 16 byte long). */ sse2_buffer[0] = 0x80000000; @@ -2440,58 +2884,60 @@ static void init_compiler(void) sse2_buffer[13] = 0x7fffffff; } -static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode, - sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w) +static sljit_s32 emit_groupf(struct sljit_compiler *compiler, + sljit_uw op, + sljit_s32 dst, sljit_s32 src, sljit_sw srcw) { - sljit_u8 *inst; - - inst = emit_x86_instruction(compiler, 2 | (single ? 
EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w); + sljit_u8 *inst = emit_x86_instruction(compiler, 2 | (op & ~(sljit_uw)0xff), dst, 0, src, srcw); FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = opcode; + inst[0] = GROUP_0F; + inst[1] = op & 0xff; return SLJIT_SUCCESS; } -static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode, - sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w) +static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler, + sljit_uw op, + sljit_s32 dst, sljit_s32 src, sljit_sw srcw) { sljit_u8 *inst; - inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w); + SLJIT_ASSERT((op & EX86_SSE2) && ((op & VEX_OP_0F38) || (op & VEX_OP_0F3A))); + + inst = emit_x86_instruction(compiler, 3 | (op & ~((sljit_uw)0xff | VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw); FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = opcode; + inst[0] = GROUP_0F; + inst[1] = U8((op & VEX_OP_0F38) ? 0x38 : 0x3A); + inst[2] = op & 0xff; return SLJIT_SUCCESS; } static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler, sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw) { - return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw); + return emit_groupf(compiler, MOVSD_x_xm | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw); } static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler, sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src) { - return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw); + return emit_groupf(compiler, MOVSD_xm_x | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw); } static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw dstw, sljit_s32 src, sljit_sw srcw) { - sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; - sljit_u8 *inst; + sljit_s32 dst_r; + + CHECK_EXTRA_REGS(dst, dstw, (void)0); + dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64) compiler->mode32 = 0; #endif - inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = CVTTSD2SI_r_xm; + FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw)); if (dst & SLJIT_MEM) return emit_mov(compiler, dst, dstw, TMP_REG1, 0); @@ -2503,14 +2949,15 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp sljit_s32 src, sljit_sw srcw) { sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG; - sljit_u8 *inst; + + CHECK_EXTRA_REGS(src, srcw, (void)0); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW) compiler->mode32 = 0; #endif - if (src & SLJIT_IMM) { + if (src == SLJIT_IMM) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) srcw = (sljit_s32)srcw; @@ -2520,16 +2967,13 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp srcw = 0; } - inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? 
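/* The prefix scheme this ternary (and emit_groupf) relies on: scalar SSE
   opcodes live in the 0F map and select their width through a mandatory
   prefix, F3 for single precision and F2 for double (EX86_SELECT_F2_F3),
   while 66 selects the packed-double forms used by UCOMISD/ANDPD/XORPD
   (EX86_SELECT_66). A concrete pair of encodings for the same opcode: */
static const unsigned char cvttsd2si_eax_xmm0[] = { 0xf2, 0x0f, 0x2c, 0xc0 };
static const unsigned char cvttss2si_eax_xmm0[] = { 0xf3, 0x0f, 0x2c, 0xc0 };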
EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw); - FAIL_IF(!inst); - *inst++ = GROUP_0F; - *inst = CVTSI2SD_x_rm; + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; #endif if (dst_r == TMP_FREG) - return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG); + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); return SLJIT_SUCCESS; } @@ -2537,12 +2981,37 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile sljit_s32 src1, sljit_sw src1w, sljit_s32 src2, sljit_sw src2w) { + switch (GET_FLAG_TYPE(op)) { + case SLJIT_ORDERED_EQUAL: + /* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */ + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w)); + FAIL_IF(emit_groupf(compiler, CMPS_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w)); + + /* EQ */ + FAIL_IF(emit_byte(compiler, 0)); + + src1 = TMP_FREG; + src2 = TMP_FREG; + src2w = 0; + break; + + case SLJIT_ORDERED_LESS: + case SLJIT_UNORDERED_OR_GREATER: + /* Also: SLJIT_UNORDERED_OR_GREATER_EQUAL, SLJIT_ORDERED_LESS_EQUAL */ + if (!FAST_IS_REG(src2)) { + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w)); + src2 = TMP_FREG; + } + + return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w); + } + if (!FAST_IS_REG(src1)) { - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w)); + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w)); src1 = TMP_FREG; } - return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w); + return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op, @@ -2550,6 +3019,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil sljit_s32 src, sljit_sw srcw) { sljit_s32 dst_r; + sljit_u8 *inst; #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; @@ -2560,11 +3030,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil if (GET_OPCODE(op) == SLJIT_MOV_F64) { if (FAST_IS_REG(dst)) - return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw); + return emit_sse2_load(compiler, op & SLJIT_32, dst, src, srcw); if (FAST_IS_REG(src)) - return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src); - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw)); - return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG); + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, src); + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw)); + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); } if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) { @@ -2573,42 +3043,57 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil /* We overwrite the high bits of source. From SLJIT point of view, this is not an issue. Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */ - FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0)); - } - else { - FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw)); + FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm | ((op & SLJIT_32) ? 
EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0)); + } else { + FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw)); src = TMP_FREG; } - FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0)); + FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0)); if (dst_r == TMP_FREG) - return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG); + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); return SLJIT_SUCCESS; } if (FAST_IS_REG(dst)) { - dst_r = dst; - if (dst != src) - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw)); - } - else { - dst_r = TMP_FREG; - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw)); + dst_r = (dst == src) ? TMP_FREG : dst; + + if (src & SLJIT_MEM) + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw)); + + FAIL_IF(emit_groupf(compiler, PCMPEQD_x_xm | EX86_PREF_66 | EX86_SSE2, dst_r, dst_r, 0)); + + inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, dst_r, 0); + inst[0] = GROUP_0F; + /* Same as PSRLD_x / PSRLQ_x */ + inst[1] = (op & SLJIT_32) ? PSLLD_x_i8 : PSLLQ_x_i8; + + if (GET_OPCODE(op) == SLJIT_ABS_F64) { + inst[2] |= 2 << 3; + FAIL_IF(emit_byte(compiler, 1)); + } else { + inst[2] |= 6 << 3; + FAIL_IF(emit_byte(compiler, ((op & SLJIT_32) ? 31 : 63))); + } + + if (dst_r != TMP_FREG) + dst_r = (src & SLJIT_MEM) ? TMP_FREG : src; + return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_NEG_F64 ? XORPD_x_xm : ANDPD_x_xm) | EX86_SSE2, dst, dst_r, 0); } + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw)); + switch (GET_OPCODE(op)) { case SLJIT_NEG_F64: - FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8))); + FAIL_IF(emit_groupf(compiler, XORPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); break; case SLJIT_ABS_F64: - FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12))); + FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? 
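/* How the register-only NEG_F64/ABS_F64 path above manufactures its
   constant without touching memory: PCMPEQD x, x yields all ones; shifting
   that left by 31 (63 for doubles) keeps only the sign bit, which XORPD
   then flips (negate); shifting right by 1 clears only the sign bit, which
   ANDPD keeps (abs). Scalar sketch of the two masks for the 32-bit case: */
#include <stdint.h>
static uint32_t f32_sign_mask(void)  { return UINT32_C(0xffffffff) << 31; }
static uint32_t f32_value_mask(void) { return UINT32_C(0xffffffff) >> 1; }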
sse2_buffer + 4 : sse2_buffer + 12))); break; } - if (dst_r == TMP_FREG) - return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG); - return SLJIT_SUCCESS; + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op, @@ -2638,40 +3123,79 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil src2w = src1w; } else if (dst != src2) - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w)); + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src1, src1w)); else { dst_r = TMP_FREG; - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w)); + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w)); } } else { dst_r = TMP_FREG; - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w)); + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w)); } switch (GET_OPCODE(op)) { case SLJIT_ADD_F64: - FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; case SLJIT_SUB_F64: - FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, SUBSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; case SLJIT_MUL_F64: - FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, MULSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; case SLJIT_DIV_F64: - FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, DIVSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; } if (dst_r == TMP_FREG) - return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG); + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); return SLJIT_SUCCESS; } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op, + sljit_s32 dst_freg, + sljit_s32 src1, sljit_sw src1w, + sljit_s32 src2, sljit_sw src2w) +{ + sljit_uw pref; + + CHECK_ERROR(); + CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w)); + ADJUST_LOCAL_OFFSET(src1, src1w); + ADJUST_LOCAL_OFFSET(src2, src2w); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 1; +#endif + + if (dst_freg == src1) { + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w)); + pref = EX86_SELECT_66(op) | EX86_SSE2; + FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, TMP_FREG, src1, src1w)); + FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); + return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, TMP_FREG, 0); + } + + if (src1 & SLJIT_MEM) { + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w)); + src1 = TMP_FREG; + src1w = 0; + } + + if (dst_freg != src2) + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w)); + + pref = EX86_SELECT_66(op) | EX86_SSE2; + FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w)); + FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? 
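/* The XORPD/ANDPD/XORPD triple in sljit_emit_fop2r is the classic
   branchless bit blend: ((a ^ b) & mask) ^ b takes the mask bits from a
   and everything else from b. With mask = the sign bit, the result
   combines the sign of one operand with the magnitude of the other.
   Integer sketch: */
#include <stdint.h>
static uint64_t bit_blend(uint64_t a, uint64_t b, uint64_t mask)
{
	return ((a ^ b) & mask) ^ b; /* mask bits of a, remaining bits of b */
}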
sse2_buffer : sse2_buffer + 8))); + return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w); +} + /* --------------------------------------------------------------------- */ /* Conditional instructions */ /* --------------------------------------------------------------------- */ @@ -2693,9 +3217,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compi inst = (sljit_u8*)ensure_buf(compiler, 2); PTR_FAIL_IF(!inst); - - *inst++ = 0; - *inst++ = 0; + inst[0] = 0; + inst[1] = 0; return label; } @@ -2710,7 +3233,7 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump)); PTR_FAIL_IF_NULL(jump); - set_jump(jump, compiler, (type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT)); + set_jump(jump, compiler, (sljit_u32)((type & SLJIT_REWRITABLE_JUMP) | ((type & 0xff) << TYPE_SHIFT))); type &= 0xff; /* Worst case size. */ @@ -2723,8 +3246,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compile inst = (sljit_u8*)ensure_buf(compiler, 2); PTR_FAIL_IF_NULL(inst); - *inst++ = 0; - *inst++ = 1; + inst[0] = 0; + inst[1] = 1; return jump; } @@ -2742,8 +3265,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi if (src == SLJIT_IMM) { jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump)); FAIL_IF_NULL(jump); - set_jump(jump, compiler, JUMP_ADDR | (type << TYPE_SHIFT)); - jump->u.target = srcw; + set_jump(jump, compiler, (sljit_u32)(JUMP_ADDR | (type << TYPE_SHIFT))); + jump->u.target = (sljit_uw)srcw; /* Worst case size. */ #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) @@ -2755,8 +3278,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi inst = (sljit_u8*)ensure_buf(compiler, 2); FAIL_IF_NULL(inst); - *inst++ = 0; - *inst++ = 1; + inst[0] = 0; + inst[1] = 1; } else { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) @@ -2765,8 +3288,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compi #endif inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw); FAIL_IF(!inst); - *inst++ = GROUP_FF; - *inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm; + inst[0] = GROUP_FF; + inst[1] = U8(inst[1] | ((type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm)); } return SLJIT_SUCCESS; } @@ -2776,10 +3299,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co sljit_s32 type) { sljit_u8 *inst; - sljit_u8 cond_set = 0; + sljit_u8 cond_set; #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) sljit_s32 reg; -#endif +#endif /* !SLJIT_CONFIG_X86_64 */ /* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */ sljit_s32 dst_save = dst; sljit_sw dstw_save = dstw; @@ -2790,9 +3313,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co ADJUST_LOCAL_OFFSET(dst, dstw); CHECK_EXTRA_REGS(dst, dstw, (void)0); - type &= 0xff; /* setcc = jcc + 0x10. */ - cond_set = get_jump_code(type) + 0x10; + cond_set = U8(get_jump_code((sljit_uw)type) + 0x10); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) { @@ -2800,13 +3322,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co FAIL_IF(!inst); INC_SIZE(4 + 3); /* Set low register to conditional flag. */ - *inst++ = (reg_map[TMP_REG1] <= 7) ? 
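/* Byte-level shape of the flag-to-register sequence assembled here: SETcc
   (0F 90+cc) writes only the low 8 bits, so it is paired with MOVZX, which
   zero-extends the byte without disturbing the flags. For "equal" into
   eax: */
static const unsigned char sete_movzx_eax[] = {
	0x0f, 0x94, 0xc0, /* sete  al */
	0x0f, 0xb6, 0xc0, /* movzx eax, al */
};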
REX : REX_B; - *inst++ = GROUP_0F; - *inst++ = cond_set; - *inst++ = MOD_REG | reg_lmap[TMP_REG1]; - *inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B); - *inst++ = OR_rm8_r8; - *inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]; + inst[0] = (reg_map[TMP_REG1] <= 7) ? REX : REX_B; + inst[1] = GROUP_0F; + inst[2] = cond_set; + inst[3] = MOD_REG | reg_lmap[TMP_REG1]; + inst[4] = U8(REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B)); + inst[5] = OR_rm8_r8; + inst[6] = U8(MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst]); return SLJIT_SUCCESS; } @@ -2816,15 +3338,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co FAIL_IF(!inst); INC_SIZE(4 + 4); /* Set low register to conditional flag. */ - *inst++ = (reg_map[reg] <= 7) ? REX : REX_B; - *inst++ = GROUP_0F; - *inst++ = cond_set; - *inst++ = MOD_REG | reg_lmap[reg]; - *inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R)); + inst[0] = (reg_map[reg] <= 7) ? REX : REX_B; + inst[1] = GROUP_0F; + inst[2] = cond_set; + inst[3] = MOD_REG | reg_lmap[reg]; + inst[4] = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R)); /* The movzx instruction does not affect flags. */ - *inst++ = GROUP_0F; - *inst++ = MOVZX_r_rm8; - *inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]; + inst[5] = GROUP_0F; + inst[6] = MOVZX_r_rm8; + inst[7] = U8(MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg]); if (reg != TMP_REG1) return SLJIT_SUCCESS; @@ -2834,165 +3356,1314 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *co return emit_mov(compiler, dst, dstw, TMP_REG1, 0); } -#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \ - || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) - compiler->skip_checks = 1; -#endif + SLJIT_SKIP_CHECKS(compiler); return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0); -#else +#else /* !SLJIT_CONFIG_X86_64 */ + SLJIT_ASSERT(reg_map[TMP_REG1] < 4); + /* The SLJIT_CONFIG_X86_32 code path starts here. */ - if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) { - if (reg_map[dst] <= 4) { - /* Low byte is accessible. */ - inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3); - FAIL_IF(!inst); - INC_SIZE(3 + 3); - /* Set low byte to conditional flag. */ - *inst++ = GROUP_0F; - *inst++ = cond_set; - *inst++ = MOD_REG | reg_map[dst]; - - *inst++ = GROUP_0F; - *inst++ = MOVZX_r_rm8; - *inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst]; + if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst) && reg_map[dst] <= 4) { + /* Low byte is accessible. */ + inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3); + FAIL_IF(!inst); + INC_SIZE(3 + 3); + /* Set low byte to conditional flag. */ + inst[0] = GROUP_0F; + inst[1] = cond_set; + inst[2] = U8(MOD_REG | reg_map[dst]); + + inst[3] = GROUP_0F; + inst[4] = MOVZX_r_rm8; + inst[5] = U8(MOD_REG | (reg_map[dst] << 3) | reg_map[dst]); + return SLJIT_SUCCESS; + } + + if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) { + inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 2); + FAIL_IF(!inst); + INC_SIZE(3 + 2); + + /* Set low byte to conditional flag. */ + inst[0] = GROUP_0F; + inst[1] = cond_set; + inst[2] = U8(MOD_REG | reg_map[TMP_REG1]); + + inst[3] = OR_rm8_r8; + inst[4] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[dst]); + return SLJIT_SUCCESS; + } + + inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3); + FAIL_IF(!inst); + INC_SIZE(3 + 3); + /* Set low byte to conditional flag. 
*/ + inst[0] = GROUP_0F; + inst[1] = cond_set; + inst[2] = U8(MOD_REG | reg_map[TMP_REG1]); + + inst[3] = GROUP_0F; + inst[4] = MOVZX_r_rm8; + inst[5] = U8(MOD_REG | (reg_map[TMP_REG1] << 3) | reg_map[TMP_REG1]); + + if (GET_OPCODE(op) < SLJIT_ADD) + return emit_mov(compiler, dst, dstw, TMP_REG1, 0); + + SLJIT_SKIP_CHECKS(compiler); + return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0); +#endif /* SLJIT_CONFIG_X86_64 */ +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_select(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_reg, + sljit_s32 src1, sljit_sw src1w, + sljit_s32 src2_reg) +{ +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + sljit_s32 dst = dst_reg; + sljit_sw dstw = 0; +#endif /* SLJIT_CONFIG_X86_32 */ + sljit_sw src2w = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_select(compiler, type, dst_reg, src1, src1w, src2_reg)); + + ADJUST_LOCAL_OFFSET(src1, src1w); + + CHECK_EXTRA_REGS(dst, dstw, (void)0); + CHECK_EXTRA_REGS(src1, src1w, (void)0); + CHECK_EXTRA_REGS(src2_reg, src2w, (void)0); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = type & SLJIT_32; +#endif /* SLJIT_CONFIG_X86_64 */ + type &= ~SLJIT_32; + +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if (dst & SLJIT_MEM) { + if (src1 == SLJIT_IMM || (!(src1 & SLJIT_MEM) && (src2_reg & SLJIT_MEM))) { + EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); + src1 = src2_reg; + src1w = src2w; + type ^= 0x1; + } else + EMIT_MOV(compiler, TMP_REG1, 0, src2_reg, src2w); + + dst_reg = TMP_REG1; + } else { +#endif /* SLJIT_CONFIG_X86_32 */ + if (dst_reg != src2_reg) { + if (dst_reg == src1) { + src1 = src2_reg; + src1w = src2w; + type ^= 0x1; + } else { + if (ADDRESSING_DEPENDS_ON(src1, dst_reg)) { + EMIT_MOV(compiler, dst_reg, 0, src1, src1w); + src1 = src2_reg; + src1w = src2w; + type ^= 0x1; + } else + EMIT_MOV(compiler, dst_reg, 0, src2_reg, src2w); + } + } + + if (SLJIT_UNLIKELY(src1 == SLJIT_IMM)) { + SLJIT_ASSERT(dst_reg != TMP_REG1); + EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w); + src1 = TMP_REG1; + src1w = 0; + } +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + } +#endif /* SLJIT_CONFIG_X86_32 */ + + if (sljit_has_cpu_feature(SLJIT_HAS_CMOV)) + FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), dst_reg, src1, src1w)); + else + FAIL_IF(emit_cmov_generic(compiler, type, dst_reg, src1, src1w)); + +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if (dst_reg == TMP_REG1) + return emit_mov(compiler, dst, dstw, TMP_REG1, 0); +#endif /* SLJIT_CONFIG_X86_32 */ + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fselect(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, + sljit_s32 src1, sljit_sw src1w, + sljit_s32 src2_freg) +{ + sljit_u8* inst; + sljit_uw size; + + CHECK_ERROR(); + CHECK(check_sljit_emit_fselect(compiler, type, dst_freg, src1, src1w, src2_freg)); + + ADJUST_LOCAL_OFFSET(src1, src1w); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 1; +#endif /* SLJIT_CONFIG_X86_64 */ + + if (dst_freg != src2_freg) { + if (dst_freg == src1) { + src1 = src2_freg; + src1w = 0; + type ^= 0x1; + } else + FAIL_IF(emit_sse2_load(compiler, type & SLJIT_32, dst_freg, src2_freg, 0)); + } + + inst = (sljit_u8*)ensure_buf(compiler, 1 + 2); + FAIL_IF(!inst); + INC_SIZE(2); + inst[0] = U8(get_jump_code((sljit_uw)(type & ~SLJIT_32) ^ 0x1) - 0x10); + + size = compiler->size; + FAIL_IF(emit_sse2_load(compiler, type & 
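/* sljit_emit_fselect implements the select with a short forward branch: it
   emits Jcc rel8 for the inverted condition (the ^ 0x1 above, with - 0x10
   converting the long Jcc opcode to its short form), lets the conditional
   load follow, then patches the 8-bit displacement inst[1] with the load's
   size so a taken branch skips it. Smallest instance of the pattern: */
static const unsigned char jne_skip_2_bytes[] = { 0x75, 0x02 }; /* jne .+2 */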
SLJIT_32, dst_freg, src1, src1w)); + + inst[1] = U8(compiler->size - size); + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 srcdst, sljit_sw srcdstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + sljit_uw op; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + + ADJUST_LOCAL_OFFSET(srcdst, srcdstw); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 1; +#endif /* SLJIT_CONFIG_X86_64 */ + + switch (reg_size) { + case 4: + op = EX86_SSE2; + break; + case 5: + if (!(cpu_feature_list & CPU_FEATURE_AVX2)) + return SLJIT_ERR_UNSUPPORTED; + op = EX86_SSE2 | VEX_256; + break; + default: + return SLJIT_ERR_UNSUPPORTED; + } + + if (!(srcdst & SLJIT_MEM)) + alignment = reg_size; + + if (type & SLJIT_SIMD_FLOAT) { + if (elem_size == 2 || elem_size == 3) { + op |= alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm; + + if (elem_size == 3) + op |= EX86_PREF_66; + + if (type & SLJIT_SIMD_STORE) + op += 1; + } else + return SLJIT_ERR_UNSUPPORTED; + } else { + op |= ((type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm) + | (alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3); + } + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (op & VEX_256) + return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw); + + return emit_groupf(compiler, op, freg, srcdst, srcdstw); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 src, sljit_sw srcw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_u8 *inst; + sljit_u8 opcode = 0; + sljit_uw size; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + + ADJUST_LOCAL_OFFSET(src, srcw); + + if (!(type & SLJIT_SIMD_FLOAT)) { + CHECK_EXTRA_REGS(src, srcw, (void)0); + } + +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : (elem_size > 2)) + return SLJIT_ERR_UNSUPPORTED; +#else /* !SLJIT_CONFIG_X86_32 */ + compiler->mode32 = 1; + + if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2)) + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_CONFIG_X86_32 */ + + if (cpu_feature_list & CPU_FEATURE_AVX2) { + if (reg_size < 4 || reg_size > 5) + return SLJIT_ERR_UNSUPPORTED; + + if (src != SLJIT_IMM && (reg_size == 5 || elem_size < 3 || !(type & SLJIT_SIMD_FLOAT))) { + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (!(src & SLJIT_MEM) && !(type & SLJIT_SIMD_FLOAT)) { +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (elem_size >= 3) + compiler->mode32 = 0; +#endif /* SLJIT_CONFIG_X86_64 */ + FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw)); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 1; +#endif /* SLJIT_CONFIG_X86_64 */ + src = freg; + srcw = 0; + } + + switch (elem_size) { + case 0: + size = VPBROADCASTB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2; + break; + case 1: + size = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2; + break; + case 2: + size = ((type & SLJIT_SIMD_FLOAT) ? 
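/* The opcode choice in sljit_emit_simd_mov above mirrors the ISA split:
   the aligned forms MOVAPS/MOVAPD/MOVDQA (prefix 66 for the integer one)
   fault on unaligned addresses, while MOVUPS/MOVUPD/MOVDQU (F3 0F 6F for
   MOVDQU) accept any address; hence the unaligned form whenever the
   declared alignment is below the register size, and the aligned form for
   register-to-register moves. Illustrative predicate (sizes are log2;
   4 means 16 bytes): */
static int use_aligned_simd_mov(int alignment_log2, int reg_size_log2)
{
	return alignment_log2 >= reg_size_log2;
}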
VBROADCASTSS_x_xm : VPBROADCASTD_x_xm) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2; + break; + default: +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + size = VBROADCASTSD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2; +#else /* !SLJIT_CONFIG_X86_32 */ + size = ((type & SLJIT_SIMD_FLOAT) ? VBROADCASTSD_x_xm : VPBROADCASTQ_x_xm) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2; +#endif /* SLJIT_CONFIG_X86_32 */ + break; + } + + if (reg_size == 5) + size |= VEX_256; + + return emit_vex_instruction(compiler, size, freg, 0, src, srcw); + } + } else if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (type & SLJIT_SIMD_FLOAT) { + if (src == SLJIT_IMM) { + if (reg_size == 5) + return emit_vex_instruction(compiler, XORPD_x_xm | VEX_256 | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); + + return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0); + } + + if (elem_size == 2 && freg != src) { + FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw)); + src = freg; + srcw = 0; + } + + FAIL_IF(emit_groupf(compiler, (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw)); + + if (elem_size == 2) + return emit_byte(compiler, 0); + return SLJIT_SUCCESS; + } + + if (src == SLJIT_IMM) { + if (elem_size == 0) { + srcw = (sljit_u8)srcw; + srcw |= srcw << 8; + srcw |= srcw << 16; + elem_size = 2; + } else if (elem_size == 1) { + srcw = (sljit_u16)srcw; + srcw |= srcw << 16; + elem_size = 2; + } + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (elem_size == 2 && (sljit_s32)srcw == -1) + srcw = -1; +#endif /* SLJIT_CONFIG_X86_64 */ + + if (srcw == 0 || srcw == -1) { + if (reg_size == 5) + return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); + + return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0); + } + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (elem_size == 3) + FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw)); + else +#endif /* SLJIT_CONFIG_X86_64 */ + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw); + + src = TMP_REG1; + srcw = 0; + } + + size = 2; + opcode = MOVD_x_rm; + + switch (elem_size) { + case 0: + if (!FAST_IS_REG(src)) { + opcode = 0x3a /* Prefix of PINSRB_x_rm_i8. */; + size = 3; + } + break; + case 1: + if (!FAST_IS_REG(src)) + opcode = PINSRW_x_rm_i8; + break; + case 2: + break; +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + case 3: + /* MOVQ */ + compiler->mode32 = 0; + break; +#endif /* SLJIT_CONFIG_X86_64 */ + } + + inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw); + FAIL_IF(!inst); + inst[0] = GROUP_0F; + inst[1] = opcode; + + if (reg_size == 5) { + SLJIT_ASSERT(opcode == MOVD_x_rm); +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + size = VPBROADCASTD_x_xm; +#else /* !SLJIT_CONFIG_X86_32 */ + size = (elem_size == 3) ? 
VPBROADCASTQ_x_xm : VPBROADCASTD_x_xm; +#endif /* SLJIT_CONFIG_X86_32 */ + return emit_vex_instruction(compiler, size | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0); + } + + if (size == 3) { + SLJIT_ASSERT(opcode == 0x3a); + inst[2] = PINSRB_x_rm_i8; + } + + if (opcode != MOVD_x_rm) + FAIL_IF(emit_byte(compiler, 0)); + + switch (elem_size) { + case 0: + FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); + return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); + case 1: + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_byte(compiler, 0)); + /* fallthrough */ + default: + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); + return emit_byte(compiler, 0); +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + case 3: + compiler->mode32 = 1; + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); + return emit_byte(compiler, 0x44); +#endif /* SLJIT_CONFIG_X86_64 */ + } +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 srcdst, sljit_sw srcdstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_u8 *inst; + sljit_u8 opcode = 0; + sljit_uw size; + sljit_s32 freg_orig = freg; +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + sljit_s32 srcdst_is_ereg = 0; + sljit_s32 srcdst_orig = 0; + sljit_sw srcdstw_orig = 0; +#endif /* SLJIT_CONFIG_X86_32 */ + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + + ADJUST_LOCAL_OFFSET(srcdst, srcdstw); + + if (reg_size == 5) { + if (!(cpu_feature_list & CPU_FEATURE_AVX2)) + return SLJIT_ERR_UNSUPPORTED; + } else if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + +#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) + if ((type & SLJIT_SIMD_FLOAT) ? 
(elem_size < 2 || elem_size > 3) : elem_size > 2) + return SLJIT_ERR_UNSUPPORTED; +#else /* SLJIT_CONFIG_X86_32 */ + if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2)) + return SLJIT_ERR_UNSUPPORTED; +#endif /* SLJIT_CONFIG_X86_32 */ + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 1; +#else /* !SLJIT_CONFIG_X86_64 */ + if (!(type & SLJIT_SIMD_FLOAT)) { + CHECK_EXTRA_REGS(srcdst, srcdstw, srcdst_is_ereg = 1); + + if ((type & SLJIT_SIMD_STORE) && ((srcdst_is_ereg && elem_size < 2) || (elem_size == 0 && (type & SLJIT_SIMD_LANE_SIGNED) && FAST_IS_REG(srcdst) && reg_map[srcdst] >= 4))) { + srcdst_orig = srcdst; + srcdstw_orig = srcdstw; + srcdst = TMP_REG1; + srcdstw = 0; + } + } +#endif /* SLJIT_CONFIG_X86_64 */ + + if (type & SLJIT_SIMD_LANE_ZERO) { + if (lane_index == 0) { + if (!(type & SLJIT_SIMD_FLOAT)) { +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + if (elem_size == 3) { + compiler->mode32 = 0; + elem_size = 2; + } +#endif /* SLJIT_CONFIG_X86_64 */ + if (srcdst == SLJIT_IMM) { + if (elem_size == 0) + srcdstw = (sljit_u8)srcdstw; + else if (elem_size == 1) + srcdstw = (sljit_u16)srcdstw; + + EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw); + srcdst = TMP_REG1; + srcdstw = 0; + elem_size = 2; + } + + if (elem_size == 2) { + if (reg_size == 4) + return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw); + return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw); + } + } else if (srcdst & SLJIT_MEM) { + SLJIT_ASSERT(elem_size == 2 || elem_size == 3); + + if (reg_size == 4) + return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw); + return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw); + } else if (elem_size == 3) { + if (reg_size == 4) + return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0); + return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0); + } + } + + if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) { + freg = TMP_FREG; + lane_index -= (1 << (4 - elem_size)); + } else if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) { + FAIL_IF(emit_sse2_load(compiler, elem_size == 2, TMP_FREG, srcdst, srcdstw)); + srcdst = TMP_FREG; + srcdstw = 0; + } + + size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0) + | ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | EX86_SSE2; + + if (reg_size == 5) + FAIL_IF(emit_vex_instruction(compiler, size | VEX_256 | VEX_SSE2_OPV, freg, freg, freg, 0)); + else + FAIL_IF(emit_groupf(compiler, size, freg, freg, 0)); + } else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) { + FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0)); + FAIL_IF(emit_byte(compiler, 1)); + + freg = TMP_FREG; + lane_index -= (1 << (4 - elem_size)); + } + + if (type & SLJIT_SIMD_FLOAT) { + if (elem_size == 3) { + if (srcdst & SLJIT_MEM) { + if (type & SLJIT_SIMD_STORE) + size = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x; + else + size = lane_index == 0 ? 
+
+ FAIL_IF(emit_groupf(compiler, size | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
+
+ /* In case of store, freg is not TMP_FREG. */
+ } else if (type & SLJIT_SIMD_STORE) {
+ if (lane_index == 1)
+ return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0);
+ return emit_sse2_load(compiler, 0, srcdst, freg, 0);
+ } else {
+ if (lane_index == 1)
+ FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0));
+ else
+ FAIL_IF(emit_sse2_store(compiler, 0, freg, 0, srcdst));
+ }
+ } else if (type & SLJIT_SIMD_STORE) {
+ if (lane_index == 0)
+ return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg);
+
+ if (srcdst & SLJIT_MEM) {
+ FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
+ return emit_byte(compiler, U8(lane_index));
+ }
+
+ if (srcdst == freg)
+ size = SHUFPS_x_xm | EX86_SSE2;
+ else {
+ if (cpu_feature_list & CPU_FEATURE_AVX) {
+ FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0));
+ return emit_byte(compiler, U8(lane_index));
+ }
+
+ switch (lane_index) {
+ case 1:
+ size = MOVSHDUP_x_xm | EX86_PREF_F3 | EX86_SSE2;
+ break;
+ case 2:
+ size = MOVHLPS_x_x | EX86_SSE2;
+ break;
+ default:
+ SLJIT_ASSERT(lane_index == 3);
+ size = PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2;
+ break;
+ }
+ }
+
+ FAIL_IF(emit_groupf(compiler, size, srcdst, freg, 0));
+
+ size &= 0xff;
+ if (size == SHUFPS_x_xm || size == PSHUFD_x_xm)
+ return emit_byte(compiler, U8(lane_index));
+ return SLJIT_SUCCESS;
+ } else {
+ if (lane_index != 0 || (srcdst & SLJIT_MEM)) {
+ FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
+ FAIL_IF(emit_byte(compiler, U8(lane_index << 4)));
+ } else
+ FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst));
 }
- /* Low byte is not accessible. */
- if (cpu_has_cmov == -1)
- get_cpu_features();
+ if (freg != TMP_FREG || (type & SLJIT_SIMD_STORE))
+ return SLJIT_SUCCESS;
- if (cpu_has_cmov) {
- EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
- /* a xor reg, reg operation would overwrite the flags. */
- EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
+ SLJIT_ASSERT(reg_size == 5);
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
- FAIL_IF(!inst);
- INC_SIZE(3);
+ if (type & SLJIT_SIMD_LANE_ZERO) {
+ FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
+ return emit_byte(compiler, 0x4e);
+ }
+
+ FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
+ return emit_byte(compiler, 1);
+ }
+
+ if (srcdst == SLJIT_IMM) {
+ EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
+ srcdst = TMP_REG1;
+ srcdstw = 0;
+ }
+
+ size = 3;
+
+ switch (elem_size) {
+ case 0:
+ opcode = (type & SLJIT_SIMD_STORE) ? PEXTRB_rm_x_i8 : PINSRB_x_rm_i8;
+ break;
+ case 1:
+ if (!(type & SLJIT_SIMD_STORE)) {
+ size = 2;
+ opcode = PINSRW_x_rm_i8;
+ } else
+ opcode = PEXTRW_rm_x_i8;
+ break;
+ case 2:
+ opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
+ break;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ case 3:
+ /* PINSRQ / PEXTRQ */
+ opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
+ compiler->mode32 = 0;
+ break;
+#endif /* SLJIT_CONFIG_X86_64 */
+ }
- *inst++ = GROUP_0F;
- /* cmovcc = setcc - 0x50. */
- *inst++ = cond_set - 0x50;
- *inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
+ inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
+ FAIL_IF(!inst);
+ inst[0] = GROUP_0F;
+
+ if (size == 3) {
+ inst[1] = 0x3a;
+ inst[2] = opcode;
+ } else
+ inst[1] = opcode;
+
+ FAIL_IF(emit_byte(compiler, U8(lane_index)));
+
+ if (!(type & SLJIT_SIMD_LANE_SIGNED) || (srcdst & SLJIT_MEM)) {
+ if (freg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) {
+ SLJIT_ASSERT(reg_size == 5);
+
+ if (type & SLJIT_SIMD_LANE_ZERO) {
+ FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
+ return emit_byte(compiler, 0x4e);
+ }
+
+ FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
+ return emit_byte(compiler, 1);
+ }
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+ if (srcdst_orig & SLJIT_MEM)
+ return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
+#endif /* SLJIT_CONFIG_X86_32 */
+ return SLJIT_SUCCESS;
+ }
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ if (elem_size >= 3)
+ return SLJIT_SUCCESS;
+
+ compiler->mode32 = (type & SLJIT_32);
+
+ size = 2;
+
+ if (elem_size == 0)
+ size |= EX86_REX;
+
+ if (elem_size == 2) {
+ if (type & SLJIT_32)
+ return SLJIT_SUCCESS;
+
+ SLJIT_ASSERT(!(compiler->mode32));
+ size = 1;
+ }
+
+ inst = emit_x86_instruction(compiler, size, srcdst, 0, srcdst, 0);
+ FAIL_IF(!inst);
+
+ if (size != 1) {
+ inst[0] = GROUP_0F;
+ inst[1] = U8((elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16);
+ } else
+ inst[0] = MOVSXD_r_rm;
+#else /* !SLJIT_CONFIG_X86_64 */
+ if (elem_size >= 2)
+ return SLJIT_SUCCESS;
+
+ FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16,
+ (srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0));
+
+ if (srcdst_orig & SLJIT_MEM)
+ return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
+#endif /* SLJIT_CONFIG_X86_64 */
+ return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 freg,
+ sljit_s32 src, sljit_s32 src_lane_index)
+{
+ sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+ sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+ sljit_uw pref;
+ sljit_u8 byte;
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+ sljit_s32 opcode3 = TMP_REG1;
+#else /* !SLJIT_CONFIG_X86_32 */
+ sljit_s32 opcode3 = SLJIT_S0;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+ SLJIT_ASSERT(reg_map[opcode3] == 3);
+
+ if (reg_size == 5) {
+ if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+ return SLJIT_ERR_UNSUPPORTED;
+ } else if (reg_size != 4)
+ return SLJIT_ERR_UNSUPPORTED;
+
+ if (type & SLJIT_SIMD_FLOAT) {
+ pref = 0;
+ byte = U8(src_lane_index);
+
+ if (elem_size == 3) {
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ if (reg_size == 5) {
+ if (src_lane_index == 0)
+ return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
+
+ FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+
+ byte = U8(byte | (byte << 2));
+ return emit_byte(compiler, U8(byte | (byte << 4)));
+ }
+
+ if (src_lane_index == 0)
+ return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
+
+ /* Changes it to SHUFPD_x_xm. */
+ pref = EX86_PREF_66;
+ } else if (elem_size != 2)
+ return SLJIT_ERR_UNSUPPORTED;
+ else if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ if (reg_size == 5) {
+ SLJIT_ASSERT(elem_size == 2);
+
+ if (src_lane_index == 0)
+ return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
+
+ FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+
+ byte = 0x44;
+ if (src_lane_index >= 4) {
+ byte = 0xee;
+ src_lane_index -= 4;
+ }
+
+ FAIL_IF(emit_byte(compiler, byte));
+ FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
+ byte = U8(src_lane_index);
+ } else if (freg != src && (cpu_feature_list & CPU_FEATURE_AVX)) {
+ FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
+ } else {
+ if (freg != src)
+ FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0));
+
+ FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0));
 }
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
- FAIL_IF(!inst);
- INC_SIZE(1 + 3 + 3 + 1);
- *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
- /* Set al to conditional flag. */
- *inst++ = GROUP_0F;
- *inst++ = cond_set;
- *inst++ = MOD_REG | 0 /* eax */;
-
- *inst++ = GROUP_0F;
- *inst++ = MOVZX_r_rm8;
- *inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
- *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
+ if (elem_size == 2) {
+ byte = U8(byte | (byte << 2));
+ byte = U8(byte | (byte << 4));
+ } else
+ byte = U8(byte | (byte << 1));
+
+ return emit_byte(compiler, U8(byte));
+ }
+
+ if (type & SLJIT_SIMD_TEST)
 return SLJIT_SUCCESS;
+
+ if (elem_size == 0) {
+ if (reg_size == 5 && src_lane_index >= 16) {
+ FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+ FAIL_IF(emit_byte(compiler, src_lane_index >= 24 ? 0xff : 0xaa));
+ src_lane_index &= 0x7;
+ src = freg;
+ }
+
+ if ((freg != src && !(cpu_feature_list & CPU_FEATURE_AVX2)) || src_lane_index != 0) {
+ pref = 0;
+
+ if ((src_lane_index & 0x3) == 0) {
+ pref = EX86_PREF_66;
+ byte = U8(src_lane_index >> 2);
+ } else if (src_lane_index < 8 && (src_lane_index & 0x1) == 0) {
+ pref = EX86_PREF_F2;
+ byte = U8(src_lane_index >> 1);
+ } else {
+ if (freg == src || !(cpu_feature_list & CPU_FEATURE_AVX2)) {
+ if (freg != src)
+ FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
+
+ FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
+ } else
+ FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0));
+
+ FAIL_IF(emit_byte(compiler, U8(src_lane_index)));
+ }
+
+ if (pref != 0) {
+ FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
+ FAIL_IF(emit_byte(compiler, byte));
+ }
+
+ src = freg;
+ }
+
+ if (cpu_feature_list & CPU_FEATURE_AVX2)
+ return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
+
+ SLJIT_ASSERT(reg_size == 4);
+ FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
+ return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
 }
- if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
- SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);
+ if ((cpu_feature_list & CPU_FEATURE_AVX2) && src_lane_index == 0 && elem_size <= 3) {
+ switch (elem_size) {
+ case 1:
+ pref = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+ break;
+ case 2:
+ pref = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+ break;
+ default:
+ pref = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+ break;
+ }
- if (dst != SLJIT_R0) {
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
- FAIL_IF(!inst);
- INC_SIZE(1 + 3 + 2 + 1);
- /* Set low register to conditional flag. */
- *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
- *inst++ = GROUP_0F;
- *inst++ = cond_set;
- *inst++ = MOD_REG | 0 /* eax */;
- *inst++ = OR_rm8_r8;
- *inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
- *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
+ if (reg_size == 5)
+ pref |= VEX_256;
+
+ return emit_vex_instruction(compiler, pref, freg, 0, src, 0);
+ }
+
+ if (reg_size == 5) {
+ switch (elem_size) {
+ case 1:
+ byte = U8(src_lane_index & 0x3);
+ src_lane_index >>= 2;
+ pref = PSHUFLW_x_xm | VEX_256 | ((src_lane_index & 1) == 0 ? EX86_PREF_F2 : EX86_PREF_F3) | EX86_SSE2;
+ break;
+ case 2:
+ byte = U8(src_lane_index & 0x3);
+ src_lane_index >>= 1;
+ pref = PSHUFD_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2;
+ break;
+ case 3:
+ pref = 0;
+ break;
+ default:
+ FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+ return emit_byte(compiler, U8(src_lane_index == 0 ? 0x44 : 0xee));
 }
- else {
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
+
+ if (pref != 0) {
+ FAIL_IF(emit_vex_instruction(compiler, pref, freg, 0, src, 0));
+ byte = U8(byte | (byte << 2));
+ FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
+
+ if (src_lane_index == 0)
+ return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
+
+ src = freg;
+ }
+
+ FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+ byte = U8(src_lane_index);
+ byte = U8(byte | (byte << 2));
+ return emit_byte(compiler, U8(byte | (byte << 4)));
+ }
+
+ switch (elem_size) {
+ case 1:
+ byte = U8(src_lane_index & 0x3);
+ src_lane_index >>= 1;
+ pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3;
+
+ FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
+ byte = U8(byte | (byte << 2));
+ FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
+
+ if ((cpu_feature_list & CPU_FEATURE_AVX2) && pref == EX86_PREF_F2)
+ return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
+
+ src = freg;
+ /* fallthrough */
+ case 2:
+ byte = U8(src_lane_index);
+ byte = U8(byte | (byte << 2));
+ break;
+ default:
+ byte = U8(src_lane_index << 1);
+ byte = U8(byte | (byte << 2) | 0x4);
+ break;
+ }
+
+ FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
+ return emit_byte(compiler, U8(byte | (byte << 4)));
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 freg,
+ sljit_s32 src, sljit_sw srcw)
+{
+ sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+ sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+ sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+ sljit_u8 opcode;
+
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+
+ ADJUST_LOCAL_OFFSET(src, srcw);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+ if (reg_size == 5) {
+ if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+ return SLJIT_ERR_UNSUPPORTED;
+ } else if (reg_size != 4)
+ return SLJIT_ERR_UNSUPPORTED;
+
+ if (type & SLJIT_SIMD_FLOAT) {
+ if (elem_size != 2 || elem2_size != 3)
+ return SLJIT_ERR_UNSUPPORTED;
+
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ if (reg_size == 4)
+ return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw);
+ return emit_vex_instruction(compiler, CVTPS2PD_x_xm | VEX_256 | EX86_SSE2, freg, 0, src, srcw);
+ }
+
+ switch (elem_size) {
+ case 0:
+ if (elem2_size == 1)
+ opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBW_x_xm : PMOVZXBW_x_xm;
+ else if (elem2_size == 2)
+ opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBD_x_xm : PMOVZXBD_x_xm;
+ else if (elem2_size == 3)
+ opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBQ_x_xm : PMOVZXBQ_x_xm;
+ else
+ return SLJIT_ERR_UNSUPPORTED;
+ break;
+ case 1:
+ if (elem2_size == 2)
+ opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWD_x_xm : PMOVZXWD_x_xm;
+ else if (elem2_size == 3)
+ opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWQ_x_xm : PMOVZXWQ_x_xm;
+ else
+ return SLJIT_ERR_UNSUPPORTED;
+ break;
+ case 2:
+ if (elem2_size == 3)
+ opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXDQ_x_xm : PMOVZXDQ_x_xm;
+ else
+ return SLJIT_ERR_UNSUPPORTED;
+ break;
+ default:
+ return SLJIT_ERR_UNSUPPORTED;
+ }
+
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ if (reg_size == 4)
+ return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
+ return emit_vex_instruction(compiler, opcode | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 freg,
+ sljit_s32 dst, sljit_sw dstw)
+{
+ sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+ sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+ sljit_s32 dst_r;
+ sljit_uw pref;
+ sljit_u8 *inst;
+
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+
+ ADJUST_LOCAL_OFFSET(dst, dstw);
+
+ CHECK_EXTRA_REGS(dst, dstw, (void)0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+ if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
+ return SLJIT_ERR_UNSUPPORTED;
+
+ if (reg_size == 4) {
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ pref = EX86_PREF_66 | EX86_SSE2_OP2;
+
+ switch (elem_size) {
+ case 1:
+ FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
+ freg = TMP_FREG;
+ break;
+ case 2:
+ pref = EX86_SSE2_OP2;
+ break;
+ }
+
+ dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+ FAIL_IF(emit_groupf(compiler, (elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x) | pref, dst_r, freg, 0));
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = type & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+ if (elem_size == 1) {
+ inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 8, dst_r, 0);
 FAIL_IF(!inst);
- INC_SIZE(2 + 3 + 2 + 2);
- /* Set low register to conditional flag. */
- *inst++ = XCHG_r_rm;
- *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
- *inst++ = GROUP_0F;
- *inst++ = cond_set;
- *inst++ = MOD_REG | 1 /* ecx */;
- *inst++ = OR_rm8_r8;
- *inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
- *inst++ = XCHG_r_rm;
- *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
+ inst[1] |= SHR;
 }
+
+ if (dst_r == TMP_REG1)
+ return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+ return SLJIT_SUCCESS;
 }
- /* Set TMP_REG1 to the bit. */
- inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
- FAIL_IF(!inst);
- INC_SIZE(1 + 3 + 3 + 1);
- *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
- /* Set al to conditional flag. */
- *inst++ = GROUP_0F;
- *inst++ = cond_set;
- *inst++ = MOD_REG | 0 /* eax */;
+ if (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2))
+ return SLJIT_ERR_UNSUPPORTED;
+
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
- *inst++ = GROUP_0F;
- *inst++ = MOVZX_r_rm8;
- *inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
+ dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
- *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
+ if (elem_size == 1) {
+ FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
+ FAIL_IF(emit_byte(compiler, 1));
+ FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0));
+ FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
+ } else {
+ pref = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2;
- if (GET_OPCODE(op) < SLJIT_ADD)
+ if (elem_size == 0)
+ pref = PMOVMSKB_r_x | VEX_256 | EX86_PREF_66 | EX86_SSE2_OP2;
+ else if (elem_size == 3)
+ pref |= EX86_PREF_66;
+
+ FAIL_IF(emit_vex_instruction(compiler, pref, dst_r, 0, freg, 0));
+ }
+
+ if (dst_r == TMP_REG1) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = type & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
 return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+ }
-#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
- || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
- compiler->skip_checks = 1;
-#endif
- return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
+ return SLJIT_SUCCESS;
+}
+
+static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src_freg)
+{
+ sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2;
+
+ SLJIT_ASSERT(SLJIT_SIMD_GET_REG_SIZE(type) == 4);
+
+ if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3)
+ op |= EX86_PREF_66;
+
+ return emit_groupf(compiler, op, dst_freg, src_freg, 0);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+ sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+ sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+ sljit_s32 needs_move = 0;
+ sljit_uw op = 0;
+
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = 1;
 #endif /* SLJIT_CONFIG_X86_64 */
+
+ if (reg_size == 5) {
+ if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+ return SLJIT_ERR_UNSUPPORTED;
+ } else if (reg_size != 4)
+ return SLJIT_ERR_UNSUPPORTED;
+
+ if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+ return SLJIT_ERR_UNSUPPORTED;
+
+ switch (SLJIT_SIMD_GET_OPCODE(type)) {
+ case SLJIT_SIMD_OP2_AND:
+ op = (type & SLJIT_SIMD_FLOAT) ? ANDPD_x_xm : PAND_x_xm;
+
+ if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
+ op |= EX86_PREF_66;
+ break;
+ case SLJIT_SIMD_OP2_OR:
+ op = (type & SLJIT_SIMD_FLOAT) ? ORPD_x_xm : POR_x_xm;
+
+ if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
+ op |= EX86_PREF_66;
+ break;
+ case SLJIT_SIMD_OP2_XOR:
+ op = (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm;
+
+ if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
+ op |= EX86_PREF_66;
+ break;
+ }
+
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ needs_move = dst_freg != src1_freg && dst_freg != src2_freg;
+
+ if (reg_size == 5 || (needs_move && (cpu_feature_list & CPU_FEATURE_AVX2))) {
+ if (reg_size == 5)
+ op |= VEX_256;
+
+ return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0);
+ }
+
+ if (needs_move) {
+ FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg));
+ } else if (dst_freg != src1_freg) {
+ SLJIT_ASSERT(dst_freg == src2_freg);
+ src2_freg = src1_freg;
+ }
+
+ FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0));
+ return SLJIT_SUCCESS;
 }
-SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
 sljit_s32 dst_reg,
- sljit_s32 src, sljit_sw srcw)
+ sljit_s32 mem_reg)
 {
- sljit_u8* inst;
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
+
+ SLJIT_SKIP_CHECKS(compiler);
+ return sljit_emit_op1(compiler, op, dst_reg, 0, SLJIT_MEM1(mem_reg), 0);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler, sljit_s32 op,
+ sljit_s32 src_reg,
+ sljit_s32 mem_reg,
+ sljit_s32 temp_reg)
+{
+ sljit_uw pref;
+ sljit_s32 free_reg = TMP_REG1;
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+ sljit_sw srcw = 0;
+ sljit_sw tempw = 0;
+#endif /* SLJIT_CONFIG_X86_32 */
 CHECK_ERROR();
- CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));
+ CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
+ CHECK_EXTRA_REGS(src_reg, srcw, (void)0);
+ CHECK_EXTRA_REGS(temp_reg, tempw, (void)0);
+
+ SLJIT_ASSERT(FAST_IS_REG(src_reg) || src_reg == SLJIT_MEM1(SLJIT_SP));
+ SLJIT_ASSERT(FAST_IS_REG(temp_reg) || temp_reg == SLJIT_MEM1(SLJIT_SP));
+ op = GET_OPCODE(op);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- dst_reg &= ~SLJIT_I32_OP;
+ if ((src_reg & SLJIT_MEM) || (op == SLJIT_MOV_U8 && reg_map[src_reg] >= 4)) {
+ /* Src is virtual register or its low byte is not accessible. */
+ SLJIT_ASSERT(src_reg != SLJIT_R1);
+ free_reg = src_reg;
- if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV) || (dst_reg >= SLJIT_R3 && dst_reg <= SLJIT_S3))
- return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
-#else
- if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV))
- return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
-#endif
+ EMIT_MOV(compiler, TMP_REG1, 0, src_reg, srcw);
+ src_reg = TMP_REG1;
- /* ADJUST_LOCAL_OFFSET is not needed. */
- CHECK_EXTRA_REGS(src, srcw, (void)0);
+ if (mem_reg == src_reg)
+ mem_reg = TMP_REG1;
+ }
+#endif /* SLJIT_CONFIG_X86_32 */
+ if (temp_reg != SLJIT_R0) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
- compiler->mode32 = dst_reg & SLJIT_I32_OP;
- dst_reg &= ~SLJIT_I32_OP;
-#endif
+ compiler->mode32 = 0;
- if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
- EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
- src = TMP_REG1;
- srcw = 0;
+ EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
+ EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, 0);
+
+ if (src_reg == SLJIT_R0)
+ src_reg = free_reg;
+ if (mem_reg == SLJIT_R0)
+ mem_reg = free_reg;
+#else /* !SLJIT_CONFIG_X86_64 */
+ if (src_reg == TMP_REG1 && mem_reg == SLJIT_R0 && (free_reg & SLJIT_MEM)) {
+ EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R1, 0);
+ EMIT_MOV(compiler, SLJIT_R1, 0, SLJIT_R0, 0);
+ EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);
+
+ mem_reg = SLJIT_R1;
+ free_reg = SLJIT_R1;
+ } else {
+ EMIT_MOV(compiler, free_reg, 0, SLJIT_R0, 0);
+ EMIT_MOV(compiler, SLJIT_R0, 0, temp_reg, tempw);
+
+ if (src_reg == SLJIT_R0)
+ src_reg = free_reg;
+ if (mem_reg == SLJIT_R0)
+ mem_reg = free_reg;
+ }
+#endif /* SLJIT_CONFIG_X86_64 */
 }
- inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
- FAIL_IF(!inst);
- *inst++ = GROUP_0F;
- *inst = get_jump_code(type & 0xff) - 0x40;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = op != SLJIT_MOV && op != SLJIT_MOV_P;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+ /* Lock prefix. */
+ FAIL_IF(emit_byte(compiler, GROUP_LOCK));
+
+ pref = 0;
+ if (op == SLJIT_MOV_U16)
+ pref = EX86_HALF_ARG | EX86_PREF_66;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ if (op == SLJIT_MOV_U8)
+ pref = EX86_REX;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+ FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0));
+
+ if (temp_reg != SLJIT_R0) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ compiler->mode32 = 0;
+ return emit_mov(compiler, SLJIT_R0, 0, TMP_REG1, 0);
+#else /* !SLJIT_CONFIG_X86_64 */
+ EMIT_MOV(compiler, SLJIT_R0, 0, free_reg, 0);
+ if (free_reg != TMP_REG1)
+ return emit_mov(compiler, free_reg, 0, (free_reg == SLJIT_R1) ? SLJIT_MEM1(SLJIT_SP) : TMP_REG1, 0);
+#endif /* SLJIT_CONFIG_X86_64 */
+ }
 return SLJIT_SUCCESS;
 }
@@ -3059,8 +4730,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compi
 inst = (sljit_u8*)ensure_buf(compiler, 2);
 PTR_FAIL_IF(!inst);
- *inst++ = 0;
- *inst++ = 2;
+ inst[0] = 0;
+ inst[1] = 2;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 if (dst & SLJIT_MEM)
@@ -3113,8 +4784,8 @@ SLJIT_API_FUNC_ATTRIBUTE struct sljit_put_label* sljit_emit_put_label(struct slj
 inst = (sljit_u8*)ensure_buf(compiler, 2);
 PTR_FAIL_IF(!inst);
- *inst++ = 0;
- *inst++ = 3;
+ inst[0] = 0;
+ inst[1] = 3;
 return put_label;
 }
@@ -3125,9 +4796,9 @@ SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_ta
 SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 0);
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- sljit_unaligned_store_sw((void*)addr, new_target - (addr + 4) - (sljit_uw)executable_offset);
+ sljit_unaligned_store_sw((void*)addr, (sljit_sw)(new_target - (addr + 4) - (sljit_uw)executable_offset));
 #else
- sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_target);
+ sljit_unaligned_store_sw((void*)addr, (sljit_sw)new_target);
 #endif
 SLJIT_UPDATE_WX_FLAGS((void*)addr, (void*)(addr + sizeof(sljit_uw)), 1);
 }