Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll                  |    2
-rw-r--r--  llvm/test/CodeGen/AArch64/and-sink.ll                            |    9
-rw-r--r--  llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll          |  122
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll                                 |   22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/build_vector.ll                         |   97
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll                  |   20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i16.ll                    |   22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i8.ll                     |   90
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir      |   26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vopd-combine.mir                        |   32
-rw-r--r--  llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll                      |   18
-rw-r--r--  llvm/test/CodeGen/RISCV/prefer-w-inst.ll                         |  105
-rw-r--r--  llvm/test/CodeGen/RISCV/prefer-w-inst.mir                        |  262
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/calling-conv.ll                      |  203
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll          |    5
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll               |   19
-rw-r--r--  llvm/test/CodeGen/RISCV/strip-w-suffix.ll                        |   74
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/fconstant.ll                    |   13
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll |  153
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll            |   29
-rw-r--r--  llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll    |   42
-rw-r--r--  llvm/test/CodeGen/X86/dpbusd.ll                                  |    2
-rw-r--r--  llvm/test/CodeGen/X86/dpbusd_i4.ll                               |    2
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll    | 1300
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll    |  602
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll            |   31
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll        |   44
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll          |   15
-rw-r--r--  llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll   |   28
29 files changed, 2073 insertions, 1316 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll
index bd576d0f70e9..8c6e01d934c2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll
@@ -3,7 +3,7 @@
declare void @llvm.va_start(ptr)
-define void @test_va_start(ptr %list) {
+define void @test_va_start(ptr %list, ...) {
; CHECK-LABEL: name: test_va_start
; CHECK: [[LIST:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store (s64) into %ir.list, align 1)
diff --git a/llvm/test/CodeGen/AArch64/and-sink.ll b/llvm/test/CodeGen/AArch64/and-sink.ll
index f298a55dab72..a57e9d54f307 100644
--- a/llvm/test/CodeGen/AArch64/and-sink.ll
+++ b/llvm/test/CodeGen/AArch64/and-sink.ll
@@ -11,15 +11,14 @@
define dso_local i32 @and_sink1(i32 %a, i1 %c) {
; CHECK-LABEL: and_sink1:
; CHECK: // %bb.0:
-; CHECK-NEXT: tbz w1, #0, .LBB0_3
+; CHECK-NEXT: tbz w1, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %bb0
+; CHECK-NEXT: tst w0, #0x4
; CHECK-NEXT: adrp x8, A
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: str wzr, [x8, :lo12:A]
-; CHECK-NEXT: tbnz w0, #2, .LBB0_3
-; CHECK-NEXT: // %bb.2:
-; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB0_3: // %bb2
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index 6449c3e11d66..dde3e81833a6 100644
--- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -13,10 +13,10 @@ define i32 @combine_gt_ge_10() #0 {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:a
; CHECK-NEXT: ldr x8, [x8, :got_lo12:a]
-; CHECK-NEXT: ldr w8, [x8]
-; CHECK-NEXT: cmp w8, #10
+; CHECK-NEXT: ldr w9, [x8]
; CHECK-NEXT: adrp x8, :got:b
; CHECK-NEXT: ldr x8, [x8, :got_lo12:b]
+; CHECK-NEXT: cmp w9, #10
; CHECK-NEXT: b.le .LBB0_3
; CHECK-NEXT: // %bb.1: // %land.lhs.true
; CHECK-NEXT: adrp x9, :got:c
@@ -29,18 +29,17 @@ define i32 @combine_gt_ge_10() #0 {
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_3: // %lor.lhs.false
-; CHECK-NEXT: b.lt .LBB0_6
+; CHECK-NEXT: cmp w9, #10
+; CHECK-NEXT: b.lt .LBB0_5
; CHECK-NEXT: .LBB0_4: // %land.lhs.true3
; CHECK-NEXT: adrp x9, :got:d
; CHECK-NEXT: ldr x9, [x9, :got_lo12:d]
; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: ldr w9, [x9]
; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: b.ne .LBB0_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB0_6: // %if.end
+; CHECK-NEXT: .LBB0_5:
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
entry:
@@ -145,10 +144,10 @@ define i32 @combine_lt_ge_5() #0 {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:a
; CHECK-NEXT: ldr x8, [x8, :got_lo12:a]
-; CHECK-NEXT: ldr w8, [x8]
-; CHECK-NEXT: cmp w8, #5
+; CHECK-NEXT: ldr w9, [x8]
; CHECK-NEXT: adrp x8, :got:b
; CHECK-NEXT: ldr x8, [x8, :got_lo12:b]
+; CHECK-NEXT: cmp w9, #5
; CHECK-NEXT: b.ge .LBB2_3
; CHECK-NEXT: // %bb.1: // %land.lhs.true
; CHECK-NEXT: adrp x9, :got:c
@@ -161,18 +160,17 @@ define i32 @combine_lt_ge_5() #0 {
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_3: // %lor.lhs.false
-; CHECK-NEXT: b.gt .LBB2_6
+; CHECK-NEXT: cmp w9, #5
+; CHECK-NEXT: b.gt .LBB2_5
; CHECK-NEXT: .LBB2_4: // %land.lhs.true3
; CHECK-NEXT: adrp x9, :got:d
; CHECK-NEXT: ldr x9, [x9, :got_lo12:d]
; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: ldr w9, [x9]
; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: b.ne .LBB2_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB2_6: // %if.end
+; CHECK-NEXT: .LBB2_5:
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
entry:
@@ -499,24 +497,17 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
; CHECK-NEXT: // %bb.3: // %while.cond.while.end_crit_edge
; CHECK-NEXT: ldr w8, [x19]
; CHECK-NEXT: .LBB7_4: // %while.end
-; CHECK-NEXT: cmp w8, #1
-; CHECK-NEXT: b.gt .LBB7_7
-; CHECK-NEXT: // %bb.5: // %land.lhs.true
-; CHECK-NEXT: adrp x8, :got:b
-; CHECK-NEXT: adrp x9, :got:d
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:b]
-; CHECK-NEXT: ldr x9, [x9, :got_lo12:d]
-; CHECK-NEXT: ldr w8, [x8]
-; CHECK-NEXT: ldr w9, [x9]
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: b.ne .LBB7_7
-; CHECK-NEXT: // %bb.6:
-; CHECK-NEXT: mov w0, #123 // =0x7b
-; CHECK-NEXT: b .LBB7_8
-; CHECK-NEXT: .LBB7_7: // %if.end
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: .LBB7_8: // %return
+; CHECK-NEXT: adrp x9, :got:b
+; CHECK-NEXT: adrp x10, :got:d
+; CHECK-NEXT: ldr x9, [x9, :got_lo12:b]
+; CHECK-NEXT: ldr x10, [x10, :got_lo12:d]
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr w9, [x9]
+; CHECK-NEXT: ldr w10, [x10]
+; CHECK-NEXT: cmp w9, w10
+; CHECK-NEXT: ccmp w8, #2, #0, eq
+; CHECK-NEXT: mov w8, #123 // =0x7b
+; CHECK-NEXT: csel w0, w8, wzr, lt
; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w19
@@ -564,52 +555,42 @@ return: ; preds = %if.end, %land.lhs.t
define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 {
; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: .cfi_remember_state
; CHECK-NEXT: adrp x8, :got:a
; CHECK-NEXT: ldr x8, [x8, :got_lo12:a]
; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: b.gt .LBB8_3
+; CHECK-NEXT: b.gt .LBB8_4
; CHECK-NEXT: // %bb.1: // %while.body.preheader
+; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: sub w19, w8, #1
; CHECK-NEXT: .LBB8_2: // %while.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: bl do_something
; CHECK-NEXT: adds w19, w19, #1
; CHECK-NEXT: b.mi .LBB8_2
-; CHECK-NEXT: .LBB8_3: // %while.end
-; CHECK-NEXT: adrp x8, :got:c
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:c]
-; CHECK-NEXT: ldr w8, [x8]
-; CHECK-NEXT: cmn w8, #2
-; CHECK-NEXT: b.lt .LBB8_6
-; CHECK-NEXT: // %bb.4: // %land.lhs.true
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .LBB8_4: // %while.end
; CHECK-NEXT: adrp x8, :got:b
; CHECK-NEXT: adrp x9, :got:d
+; CHECK-NEXT: adrp x10, :got:c
; CHECK-NEXT: ldr x8, [x8, :got_lo12:b]
; CHECK-NEXT: ldr x9, [x9, :got_lo12:d]
+; CHECK-NEXT: ldr x10, [x10, :got_lo12:c]
; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: ldr w9, [x9]
+; CHECK-NEXT: ldr w10, [x10]
; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: b.ne .LBB8_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: mov w0, #123 // =0x7b
-; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w19
-; CHECK-NEXT: .cfi_restore w30
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB8_6: // %if.end
-; CHECK-NEXT: .cfi_restore_state
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: .cfi_restore w19
-; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: mov w8, #-3 // =0xfffffffd
+; CHECK-NEXT: ccmp w10, w8, #4, eq
+; CHECK-NEXT: mov w8, #123 // =0x7b
+; CHECK-NEXT: csel w0, w8, wzr, gt
; CHECK-NEXT: ret
entry:
%0 = load i32, ptr @a, align 4
@@ -782,12 +763,14 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 {
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel x9, x0, xzr, gt
; CHECK-NEXT: str x9, [x1]
-; CHECK-NEXT: b.le .LBB11_2
+; CHECK-NEXT: b.le .LBB11_3
; CHECK-NEXT: // %bb.1: // %lor.lhs.false
; CHECK-NEXT: cmp w8, #2
-; CHECK-NEXT: b.ge .LBB11_4
-; CHECK-NEXT: b .LBB11_6
-; CHECK-NEXT: .LBB11_2: // %land.lhs.true
+; CHECK-NEXT: b.ge .LBB11_5
+; CHECK-NEXT: // %bb.2:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB11_3: // %land.lhs.true
; CHECK-NEXT: adrp x8, :got:b
; CHECK-NEXT: adrp x9, :got:c
; CHECK-NEXT: ldr x8, [x8, :got_lo12:b]
@@ -795,11 +778,11 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 {
; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: ldr w9, [x9]
; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: b.ne .LBB11_4
-; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: b.ne .LBB11_5
+; CHECK-NEXT: // %bb.4:
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB11_4: // %land.lhs.true3
+; CHECK-NEXT: .LBB11_5: // %land.lhs.true3
; CHECK-NEXT: adrp x8, :got:b
; CHECK-NEXT: adrp x9, :got:d
; CHECK-NEXT: ldr x8, [x8, :got_lo12:b]
@@ -807,12 +790,7 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 {
; CHECK-NEXT: ldr w8, [x8]
; CHECK-NEXT: ldr w9, [x9]
; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: b.ne .LBB11_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB11_6: // %if.end
-; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
entry:
%0 = load i32, ptr @a, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index bf4302c156d8..4c9c34de7194 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
@@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
-; GFX10-NEXT: v_writelane_b32 v40, s35, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
@@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
-; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3
-; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5
-; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
; GFX10-NEXT: s_waitcnt vmcnt(32)
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(31)
@@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
@@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT: v_readlane_b32 s35, v40, 3
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 37412ac3aa54..99755133f36d 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL
; ALL-LABEL: {{^}}build_vector2:
; R600: MOV
@@ -96,3 +97,99 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
store <2 x i16> %ins.1, ptr addrspace(1) %out
ret void
}
+
+; R600-LABEL: build_v2i32_from_v4i16_shuffle:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 0, @10, KC0[], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
+; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV * T0.X, 0.0,
+; R600-NEXT: ALU clause starting at 11:
+; R600-NEXT: LSHL * T0.Y, T1.X, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: LSHL T0.X, T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, s0
+; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshl_b32 s3, s3, 16
+; GFX940-NEXT: s_lshl_b32 s2, s2, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
+define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
+entry:
+ %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ %zextended = zext <2 x i16> %shuf to <2 x i32>
+ %shifted = shl <2 x i32> %zextended, <i32 16, i32 16>
+ store <2 x i32> %shifted, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index ec3c08ec7952..da64c379672e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
; GFX10SELDAG-LABEL: isnan_v4f16:
; GFX10SELDAG: ; %bb.0:
; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3
-; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v1, 3
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10GLISEL-LABEL: isnan_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index ab6a9dcf71ac..a87fa8bf36d9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
-; GFX12-NEXT: s_lshr_b32 s12, s0, 16
-; GFX12-NEXT: s_mov_b32 s14, s1
-; GFX12-NEXT: s_lshr_b32 s16, s1, 16
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
; GFX12-NEXT: s_lshr_b32 s2, s2, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
+; GFX12-NEXT: s_mov_b32 s12, s1
+; GFX12-NEXT: s_lshr_b32 s14, s1, 16
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
; GFX12-NEXT: v_mov_b32_e32 v18, s20
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
-; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
-; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14
-; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16
+; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
+; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14
; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
-; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12
+; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 952827b8cd0e..889755c23bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -8808,73 +8808,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: s_lshr_b32 s24, s7, 16
+; GFX12-NEXT: s_lshr_b32 s22, s7, 16
; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8
-; GFX12-NEXT: s_lshr_b32 s42, s2, 24
-; GFX12-NEXT: s_mov_b32 s48, s7
+; GFX12-NEXT: s_lshr_b32 s40, s2, 24
+; GFX12-NEXT: s_mov_b32 s46, s7
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
-; GFX12-NEXT: s_lshr_b32 s26, s6, 16
-; GFX12-NEXT: s_lshr_b32 s44, s1, 16
+; GFX12-NEXT: s_lshr_b32 s24, s6, 16
+; GFX12-NEXT: s_lshr_b32 s42, s1, 16
; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56
-; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24
-; GFX12-NEXT: s_lshr_b32 s28, s6, 24
-; GFX12-NEXT: s_lshr_b32 s30, s5, 16
-; GFX12-NEXT: s_lshr_b32 s40, s2, 16
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22
+; GFX12-NEXT: s_lshr_b32 s26, s6, 24
+; GFX12-NEXT: s_lshr_b32 s28, s5, 16
+; GFX12-NEXT: s_lshr_b32 s38, s2, 16
; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8
; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8
; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8
; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
-; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58
-; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26
-; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48
-; GFX12-NEXT: v_mov_b32_e32 v30, s49
-; GFX12-NEXT: s_lshr_b32 s46, s0, 24
-; GFX12-NEXT: s_mov_b32 s50, s5
-; GFX12-NEXT: s_mov_b32 s52, s3
-; GFX12-NEXT: s_lshr_b32 s34, s4, 16
-; GFX12-NEXT: s_lshr_b32 s36, s4, 24
-; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58
+; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24
+; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46
+; GFX12-NEXT: v_mov_b32_e32 v30, s47
+; GFX12-NEXT: s_lshr_b32 s44, s0, 24
+; GFX12-NEXT: s_mov_b32 s48, s5
+; GFX12-NEXT: s_mov_b32 s50, s3
+; GFX12-NEXT: s_lshr_b32 s30, s4, 16
+; GFX12-NEXT: s_lshr_b32 s34, s4, 24
+; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56
; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56
; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX12-NEXT: s_lshr_b32 s38, s3, 16
-; GFX12-NEXT: s_mov_b32 s54, s1
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: s_lshr_b32 s36, s3, 16
+; GFX12-NEXT: s_mov_b32 s52, s1
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000
; GFX12-NEXT: s_lshr_b32 s20, s0, 16
; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28
-; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30
-; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56
-; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34
-; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40
-; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26
+; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28
+; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56
+; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30
+; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38
+; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000
; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240
-; GFX12-NEXT: v_mov_b32_e32 v33, s44
+; GFX12-NEXT: v_mov_b32_e32 v33, s42
; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224
; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17
; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4
@@ -8882,16 +8882,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12
; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36
-; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38
-; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34
+; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36
+; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18
; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20
; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2
; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10
-; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22
+; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54
; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index f8e7cb397b47..8a5f75332557 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,18 +28,17 @@ body: |
; GCN-LABEL: name: test_main
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
- ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr0 = COPY $sgpr33
+ ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33
; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32
- ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5)
- ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
- ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5)
+ ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3
@@ -116,18 +115,18 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: KILL implicit-def $vcc_lo, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: successors: %bb.3(0x80000000)
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3
; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2
@@ -198,16 +197,15 @@ body: |
; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
; GCN-NEXT: KILL killed renamable $vgpr2
- ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4
- ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
- ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5)
- ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
+ ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5)
+ ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc
- ; GCN-NEXT: $sgpr33 = COPY $sgpr0
+ ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi
; GCN-NEXT: S_ENDPGM 0
bb.0:
liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
index 63bef40c3474..b8ac50c3aeb5 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
@@ -160,7 +160,7 @@ body: |
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
@@ -174,7 +174,7 @@ body: |
; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $sgpr20 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
- ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX12-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX12-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
@@ -458,9 +458,9 @@ body: |
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
@@ -476,9 +476,9 @@ body: |
; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX12-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
@@ -559,12 +559,12 @@ body: |
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
@@ -586,12 +586,12 @@ body: |
; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
- ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
+ ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
- ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
- ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
+ ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX12-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll
index dddc4bd953d7..c33c81841be6 100644
--- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll
+++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll
@@ -10,12 +10,13 @@ define i32 @fred(ptr %a0) #0 {
; CHECK-LABEL: fred:
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) jump:nt .LBB0_2
+; CHECK-NEXT: r1:0 = combine(r0,#0)
+; CHECK-NEXT: if (p0) jumpr r31
; CHECK-NEXT: }
-; CHECK-NEXT: // %bb.1: // %b2
+; CHECK-NEXT: .LBB0_1: // %b2
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#0,#0)
-; CHECK-NEXT: r1:0 = memd(r0+#0)
+; CHECK-NEXT: r1:0 = memd(r1+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p0 = vcmph.eq(r1:0,r3:2)
@@ -27,16 +28,7 @@ define i32 @fred(ptr %a0) #0 {
; CHECK-NEXT: r0 = and(r0,#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = cmp.eq(r0,#11)
-; CHECK-NEXT: r0 = #1
-; CHECK-NEXT: }
-; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) r0 = #0
-; CHECK-NEXT: jumpr r31
-; CHECK-NEXT: }
-; CHECK-NEXT: .LBB0_2: // %b14
-; CHECK-NEXT: {
-; CHECK-NEXT: r0 = #0
+; CHECK-NEXT: r0 = !cmp.eq(r0,#11)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
b0:
diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.ll b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll
new file mode 100644
index 000000000000..34ab74d78a76
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=NO-PREFER-W-INST %s
+; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-disable-strip-w-suffix -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=NO-STRIP %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+prefer-w-inst -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=PREFER-W-INST %s
+
+define i32 @addiw(i32 %a) {
+; NO-PREFER-W-INST-LABEL: addiw:
+; NO-PREFER-W-INST: # %bb.0:
+; NO-PREFER-W-INST-NEXT: lui a1, 1
+; NO-PREFER-W-INST-NEXT: addi a1, a1, -1
+; NO-PREFER-W-INST-NEXT: addw a0, a0, a1
+; NO-PREFER-W-INST-NEXT: ret
+;
+; NO-STRIP-LABEL: addiw:
+; NO-STRIP: # %bb.0:
+; NO-STRIP-NEXT: lui a1, 1
+; NO-STRIP-NEXT: addiw a1, a1, -1
+; NO-STRIP-NEXT: addw a0, a0, a1
+; NO-STRIP-NEXT: ret
+;
+; PREFER-W-INST-LABEL: addiw:
+; PREFER-W-INST: # %bb.0:
+; PREFER-W-INST-NEXT: lui a1, 1
+; PREFER-W-INST-NEXT: addiw a1, a1, -1
+; PREFER-W-INST-NEXT: addw a0, a0, a1
+; PREFER-W-INST-NEXT: ret
+ %ret = add i32 %a, 4095
+ ret i32 %ret
+}
+
+define i32 @addw(i32 %a, i32 %b) {
+; NO-PREFER-W-INST-LABEL: addw:
+; NO-PREFER-W-INST: # %bb.0:
+; NO-PREFER-W-INST-NEXT: add a0, a0, a1
+; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024
+; NO-PREFER-W-INST-NEXT: ret
+;
+; NO-STRIP-LABEL: addw:
+; NO-STRIP: # %bb.0:
+; NO-STRIP-NEXT: addw a0, a0, a1
+; NO-STRIP-NEXT: addiw a0, a0, 1024
+; NO-STRIP-NEXT: ret
+;
+; PREFER-W-INST-LABEL: addw:
+; PREFER-W-INST: # %bb.0:
+; PREFER-W-INST-NEXT: addw a0, a0, a1
+; PREFER-W-INST-NEXT: addiw a0, a0, 1024
+; PREFER-W-INST-NEXT: ret
+ %add = add i32 %a, %b
+ %ret = add i32 %add, 1024
+ ret i32 %ret
+}
+
+define i32 @mulw(i32 %a, i32 %b) {
+; NO-PREFER-W-INST-LABEL: mulw:
+; NO-PREFER-W-INST: # %bb.0:
+; NO-PREFER-W-INST-NEXT: mul a1, a0, a1
+; NO-PREFER-W-INST-NEXT: mul a0, a0, a1
+; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024
+; NO-PREFER-W-INST-NEXT: ret
+;
+; NO-STRIP-LABEL: mulw:
+; NO-STRIP: # %bb.0:
+; NO-STRIP-NEXT: mulw a1, a0, a1
+; NO-STRIP-NEXT: mulw a0, a0, a1
+; NO-STRIP-NEXT: addiw a0, a0, 1024
+; NO-STRIP-NEXT: ret
+;
+; PREFER-W-INST-LABEL: mulw:
+; PREFER-W-INST: # %bb.0:
+; PREFER-W-INST-NEXT: mulw a1, a0, a1
+; PREFER-W-INST-NEXT: mulw a0, a0, a1
+; PREFER-W-INST-NEXT: addiw a0, a0, 1024
+; PREFER-W-INST-NEXT: ret
+ %mul1 = mul i32 %a, %b
+ %mul = mul i32 %a, %mul1
+ %ret = add i32 %mul, 1024
+ ret i32 %ret
+}
+
+define i32 @slliw(i32 %a) {
+; NO-PREFER-W-INST-LABEL: slliw:
+; NO-PREFER-W-INST: # %bb.0:
+; NO-PREFER-W-INST-NEXT: slli a0, a0, 1
+; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024
+; NO-PREFER-W-INST-NEXT: ret
+;
+; NO-STRIP-LABEL: slliw:
+; NO-STRIP: # %bb.0:
+; NO-STRIP-NEXT: slliw a0, a0, 1
+; NO-STRIP-NEXT: addiw a0, a0, 1024
+; NO-STRIP-NEXT: ret
+;
+; PREFER-W-INST-LABEL: slliw:
+; PREFER-W-INST: # %bb.0:
+; PREFER-W-INST-NEXT: slliw a0, a0, 1
+; PREFER-W-INST-NEXT: addiw a0, a0, 1024
+; PREFER-W-INST-NEXT: ret
+ %shl = shl i32 %a, 1
+ %ret = add i32 %shl, 1024
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
new file mode 100644
index 000000000000..e05e27af4271
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir
@@ -0,0 +1,262 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \
+# RUN: -mattr=+m -o - | FileCheck %s -check-prefixes=NO-PREFER-W-INST
+# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \
+# RUN: -mattr=+m,+prefer-w-inst -o - | FileCheck %s -check-prefixes=PREFER-W-INST
+
+---
+name: addi
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: addi
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDI]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: addi
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1
+ ; PREFER-W-INST-NEXT: [[ADDIW1:%[0-9]+]]:gpr = ADDIW [[ADDIW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW1]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 1
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
+---
+name: add
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: add
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[COPY1]]
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADD]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: add
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[ADDW:%[0-9]+]]:gpr = ADDW [[COPY]], [[COPY1]]
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
+---
+name: sub
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: sub
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[COPY1]]
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUB]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: sub
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[COPY1]]
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUBW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SUB %1, %2
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
+---
+name: mul
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: mul
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[MUL:%[0-9]+]]:gpr = MUL [[COPY]], [[COPY1]]
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MUL]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: mul
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[MULW:%[0-9]+]]:gpr = MULW [[COPY]], [[COPY1]]
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MULW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = MUL %1, %2
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
+
+---
+name: slli_31
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: slli_31
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 31
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: slli_31
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[SLLIW:%[0-9]+]]:gpr = SLLIW [[COPY]], 31
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLIW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SLLI %1, 31
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
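+# SLLIW can only encode shift amounts 0-31, so a 64-bit shift by 32 cannot be
+# narrowed; the next test expects the SLLI to be left unchanged.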
+---
+name: slli_32
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: slli_32
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: slli_32
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SLLI %1, 32
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
+---
+name: ld
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: ld
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[LD:%[0-9]+]]:gpr = LD [[COPY]], 0
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LD]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: ld
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = LD %1, 0
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
+
+---
+name: lwu
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; NO-PREFER-W-INST-LABEL: name: lwu
+ ; NO-PREFER-W-INST: liveins: $x10, $x11
+ ; NO-PREFER-W-INST-NEXT: {{ $}}
+ ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0
+ ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1
+ ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; NO-PREFER-W-INST-NEXT: PseudoRET
+ ;
+ ; PREFER-W-INST-LABEL: name: lwu
+ ; PREFER-W-INST: liveins: $x10, $x11
+ ; PREFER-W-INST-NEXT: {{ $}}
+ ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
+ ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11
+ ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0
+ ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1
+ ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]]
+ ; PREFER-W-INST-NEXT: PseudoRET
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = LWU %1, 0
+ %4:gpr = ADDIW %3, 1
+ $x10 = COPY %4
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
index 78e8700a9fef..647d3158b616 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
@@ -162,3 +162,214 @@ define void @caller_tuple_argument({<vscale x 4 x i32>, <vscale x 4 x i32>} %x)
}
declare void @callee_tuple_argument({<vscale x 4 x i32>, <vscale x 4 x i32>})
+
+; %0 -> v8
+; %1 -> v9
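+; Vector arguments are passed in v8-v23 under the RVV calling convention; two
+; LMUL=1 values simply take the consecutive single registers v8 and v9.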
+define <vscale x 1 x i64> @case1(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1) {
+; CHECK-LABEL: case1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %a = add <vscale x 1 x i64> %0, %1
+ ret <vscale x 1 x i64> %a
+}
+
+; %0 -> v8
+; %1 -> v10-v11
+; %2 -> v9
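+; An LMUL=2 value must start at an even-numbered register, so %1 takes v10-v11
+; and the later LMUL=1 argument %2 back-fills the skipped v9.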
+define <vscale x 1 x i64> @case2_1(<vscale x 1 x i64> %0, <vscale x 2 x i64> %1, <vscale x 1 x i64> %2) {
+; CHECK-LABEL: case2_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %a = add <vscale x 1 x i64> %0, %2
+ ret <vscale x 1 x i64> %a
+}
+define <vscale x 2 x i64> @case2_2(<vscale x 1 x i64> %0, <vscale x 2 x i64> %1, <vscale x 1 x i64> %2) {
+; CHECK-LABEL: case2_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v10
+; CHECK-NEXT: ret
+ %a = add <vscale x 2 x i64> %1, %1
+ ret <vscale x 2 x i64> %a
+}
+
+; %0 -> v8
+; %1 -> {v10-v11, v12-v13}
+; %2 -> v9
+define <vscale x 1 x i64> @case3_1(<vscale x 1 x i64> %0, {<vscale x 2 x i64>, <vscale x 2 x i64>} %1, <vscale x 1 x i64> %2) {
+; CHECK-LABEL: case3_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %add = add <vscale x 1 x i64> %0, %2
+ ret <vscale x 1 x i64> %add
+}
+define <vscale x 2 x i64> @case3_2(<vscale x 1 x i64> %0, {<vscale x 2 x i64>, <vscale x 2 x i64>} %1, <vscale x 1 x i64> %2) {
+; CHECK-LABEL: case3_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %a = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 0
+ %b = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 1
+ %add = add <vscale x 2 x i64> %a, %b
+ ret <vscale x 2 x i64> %add
+}
+
+; %0 -> v8
+; %1 -> {by-ref, by-ref}
+; %2 -> v9
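+; The two LMUL=8 tuple members cannot fit in v8-v23 together with %0, so the
+; tuple is passed indirectly: a0 carries its address and case4_1 reloads it.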
+define <vscale x 8 x i64> @case4_1(<vscale x 1 x i64> %0, {<vscale x 8 x i64>, <vscale x 8 x i64>} %1, <vscale x 1 x i64> %2) {
+; CHECK-LABEL: case4_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vl8re64.v v8, (a1)
+; CHECK-NEXT: vl8re64.v v16, (a0)
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: ret
+ %a = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64> } %1, 0
+ %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64> } %1, 1
+ %add = add <vscale x 8 x i64> %a, %b
+ ret <vscale x 8 x i64> %add
+}
+define <vscale x 1 x i64> @case4_2(<vscale x 1 x i64> %0, {<vscale x 8 x i64>, <vscale x 8 x i64>} %1, <vscale x 1 x i64> %2) {
+; CHECK-LABEL: case4_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %add = add <vscale x 1 x i64> %0, %2
+ ret <vscale x 1 x i64> %add
+}
+
+declare <vscale x 1 x i64> @callee1()
+declare void @callee2(<vscale x 1 x i64>)
+declare void @callee3(<vscale x 4 x i32>)
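+; Return values follow the same v8-based assignment: callee1's result comes
+; back in v8 and callee_tuple's two LMUL=2 results in v8-v9 and v10-v11.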
+define void @caller() {
+; RV32-LABEL: caller:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: call callee1
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vadd.vv v8, v8, v8
+; RV32-NEXT: call callee2
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: caller:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: call callee1
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vadd.vv v8, v8, v8
+; RV64-NEXT: call callee2
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %a = call <vscale x 1 x i64> @callee1()
+ %add = add <vscale x 1 x i64> %a, %a
+ call void @callee2(<vscale x 1 x i64> %add)
+ ret void
+}
+
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @callee_tuple()
+define void @caller_tuple() {
+; RV32-LABEL: caller_tuple:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: call callee_tuple
+; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: call callee3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: caller_tuple:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: call callee_tuple
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: call callee3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %a = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @callee_tuple()
+ %b = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %a, 0
+ %c = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %a, 1
+ %add = add <vscale x 4 x i32> %b, %c
+ call void @callee3(<vscale x 4 x i32> %add)
+ ret void
+}
+
+declare {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} @callee_nested()
+define void @caller_nested() {
+; RV32-LABEL: caller_nested:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: call callee_nested
+; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: call callee3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: caller_nested:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: call callee_nested
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: call callee3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %a = call {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} @callee_nested()
+ %b = extractvalue {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} %a, 0
+ %c = extractvalue {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} %a, 1
+ %c0 = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %c, 0
+ %c1 = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %c, 1
+ %add0 = add <vscale x 4 x i32> %b, %c0
+ %add1 = add <vscale x 4 x i32> %add0, %c1
+ call void @callee3(<vscale x 4 x i32> %add1)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 4332bf36660a..f3ce21897651 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -19,9 +19,10 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vnsrl.wi v8, v12, 8
; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i1>, ptr %p
%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index ef4baf34d23f..1347dfb6ff2a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -8,18 +8,19 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v12, v8, 1, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a0
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vmsne.vi v0, v12, 0
-; CHECK-NEXT: vnsrl.wi v12, v8, 8
-; CHECK-NEXT: vmsne.vi v8, v12, 0
+; CHECK-NEXT: vmerge.vim v14, v8, 1, v0
+; CHECK-NEXT: vnsrl.wi v8, v12, 0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: ret
%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
@@ -107,7 +108,9 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
; CHECK-NEXT: vnsrl.wi v24, v16, 8
; CHECK-NEXT: vnsrl.wi v28, v8, 8
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmsne.vi v8, v24, 0
+; CHECK-NEXT: vmsne.vi v16, v24, 0
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v16
; CHECK-NEXT: ret
%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
diff --git a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll b/llvm/test/CodeGen/RISCV/strip-w-suffix.ll
deleted file mode 100644
index 4124b3d0d360..000000000000
--- a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
-; RUN: | FileCheck -check-prefixes=STRIP %s
-; RUN: llc -mtriple=riscv64 -mattr=+m,+no-strip-w-suffix -verify-machineinstrs < %s \
-; RUN: | FileCheck -check-prefixes=NO-STRIP %s
-
-define i32 @addiw(i32 %a) {
-; STRIP-LABEL: addiw:
-; STRIP: # %bb.0:
-; STRIP-NEXT: lui a1, 1
-; STRIP-NEXT: addi a1, a1, -1
-; STRIP-NEXT: addw a0, a0, a1
-; STRIP-NEXT: ret
-;
-; NO-STRIP-LABEL: addiw:
-; NO-STRIP: # %bb.0:
-; NO-STRIP-NEXT: lui a1, 1
-; NO-STRIP-NEXT: addiw a1, a1, -1
-; NO-STRIP-NEXT: addw a0, a0, a1
-; NO-STRIP-NEXT: ret
- %ret = add i32 %a, 4095
- ret i32 %ret
-}
-
-define i32 @addw(i32 %a, i32 %b) {
-; STRIP-LABEL: addw:
-; STRIP: # %bb.0:
-; STRIP-NEXT: add a0, a0, a1
-; STRIP-NEXT: addiw a0, a0, 1024
-; STRIP-NEXT: ret
-;
-; NO-STRIP-LABEL: addw:
-; NO-STRIP: # %bb.0:
-; NO-STRIP-NEXT: addw a0, a0, a1
-; NO-STRIP-NEXT: addiw a0, a0, 1024
-; NO-STRIP-NEXT: ret
- %add = add i32 %a, %b
- %ret = add i32 %add, 1024
- ret i32 %ret
-}
-
-define i32 @mulw(i32 %a, i32 %b) {
-; STRIP-LABEL: mulw:
-; STRIP: # %bb.0:
-; STRIP-NEXT: mul a0, a0, a1
-; STRIP-NEXT: addiw a0, a0, 1024
-; STRIP-NEXT: ret
-;
-; NO-STRIP-LABEL: mulw:
-; NO-STRIP: # %bb.0:
-; NO-STRIP-NEXT: mulw a0, a0, a1
-; NO-STRIP-NEXT: addiw a0, a0, 1024
-; NO-STRIP-NEXT: ret
- %mul = mul i32 %a, %b
- %ret = add i32 %mul, 1024
- ret i32 %ret
-}
-
-define i32 @slliw(i32 %a) {
-; STRIP-LABEL: slliw:
-; STRIP: # %bb.0:
-; STRIP-NEXT: slli a0, a0, 1
-; STRIP-NEXT: addiw a0, a0, 1024
-; STRIP-NEXT: ret
-;
-; NO-STRIP-LABEL: slliw:
-; NO-STRIP: # %bb.0:
-; NO-STRIP-NEXT: slliw a0, a0, 1
-; NO-STRIP-NEXT: addiw a0, a0, 1024
-; NO-STRIP-NEXT: ret
- %shl = shl i32 %a, 1
- %ret = add i32 %shl, 1024
- ret i32 %ret
-}
diff --git a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll
index a9b2037e9947..8d2ee3c50f21 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll
@@ -10,27 +10,22 @@ define void @test_float(ptr %a , float %b) {
; CHECK64_SMALL: # %bb.0: # %entry
; CHECK64_SMALL-NEXT: movss {{.*#+}} xmm1 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK64_SMALL-NEXT: addss %xmm0, %xmm1
-; CHECK64_SMALL-NEXT: movd %xmm1, %eax
-; CHECK64_SMALL-NEXT: movl %eax, (%rdi)
+; CHECK64_SMALL-NEXT: movss %xmm1, (%rdi)
; CHECK64_SMALL-NEXT: retq
;
; CHECK64_LARGE-LABEL: test_float:
; CHECK64_LARGE: # %bb.0: # %entry
; CHECK64_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; CHECK64_LARGE-NEXT: addss (%rax), %xmm0
-; CHECK64_LARGE-NEXT: movd %xmm0, %eax
-; CHECK64_LARGE-NEXT: movl %eax, (%rdi)
+; CHECK64_LARGE-NEXT: movss %xmm0, (%rdi)
; CHECK64_LARGE-NEXT: retq
;
; CHECK32-LABEL: test_float:
; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: movss {{.*#+}} xmm0 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK32-NEXT: movd %ecx, %xmm1
-; CHECK32-NEXT: addss %xmm0, %xmm1
-; CHECK32-NEXT: movd %xmm1, %ecx
-; CHECK32-NEXT: movl %ecx, (%eax)
+; CHECK32-NEXT: addss {{[0-9]+}}(%esp), %xmm0
+; CHECK32-NEXT: movss %xmm0, (%eax)
; CHECK32-NEXT: retl
entry:
%aa = fadd float 5.500000e+00, %b
diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll
new file mode 100644
index 000000000000..3388af605d96
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse -global-isel -stop-after=regbankselect | FileCheck %s
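+; These tests stop after regbankselect to verify that the SSE intrinsic operands
+; are assigned to the vector register bank (vecr) while pointers stay in gpr.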
+
+define void @test_x86_sse_max_ps(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_max_ps
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.max.ps), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %a1 = load <4 x float>, ptr %p2, align 16
+ %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_max_ss(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_max_ss
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.max.ss), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %a1 = load <4 x float>, ptr %p2, align 16
+ %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_min_ps(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_min_ps
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.min.ps), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %a1 = load <4 x float>, ptr %p2, align 16
+ %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_min_ss(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_min_ss
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.min.ss), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %a1 = load <4 x float>, ptr %p2, align 16
+ %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_rcp_ps(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_rcp_ps
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rcp.ps), [[LOAD1]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_rcp_ss(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_rcp_ss
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rcp.ss), [[LOAD1]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_rsqrt_ps(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_rsqrt_ps
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rsqrt.ps), [[LOAD1]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_rsqrt_ss(ptr %p1, ptr %p2) {
+ ; CHECK-LABEL: name: test_x86_sse_rsqrt_ss
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rsqrt.ss), [[LOAD1]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1)
+ ; CHECK-NEXT: RET 0
+ %a0 = load <4 x float>, ptr %p1, align 16
+ %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ store <4 x float> %res, ptr %p1
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll
index d09db0f2474c..99d458a183a9 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll
@@ -142,7 +142,7 @@ define float @f4(float %val) {
; X86-LABEL: name: f4
; X86: bb.1 (%ir-block.0):
; X86-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0
- ; X86-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (s32) from %fixed-stack.0)
+ ; X86-NEXT: [[LOAD:%[0-9]+]]:psr(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (s32) from %fixed-stack.0)
; X86-NEXT: $fp0 = COPY [[LOAD]](s32)
; X86-NEXT: RET 0, implicit $fp0
;
@@ -187,13 +187,10 @@ define void @f5(ptr %a, ptr %b) {
; X64-NEXT: {{ $}}
; X64-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY $rdi
; X64-NEXT: [[COPY1:%[0-9]+]]:gpr(p0) = COPY $rsi
- ; X64-NEXT: [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[COPY]](p0) :: (load (s64) from %ir.a)
- ; X64-NEXT: [[LOAD1:%[0-9]+]]:gpr(s64) = G_LOAD [[COPY1]](p0) :: (load (s64) from %ir.b)
- ; X64-NEXT: [[COPY2:%[0-9]+]]:psr(s64) = COPY [[LOAD]](s64)
- ; X64-NEXT: [[COPY3:%[0-9]+]]:psr(s64) = COPY [[LOAD1]](s64)
- ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s64) = G_FADD [[COPY2]], [[COPY3]]
- ; X64-NEXT: [[COPY4:%[0-9]+]]:gpr(s64) = COPY [[FADD]](s64)
- ; X64-NEXT: G_STORE [[COPY4]](s64), [[COPY]](p0) :: (store (s64) into %ir.a)
+ ; X64-NEXT: [[LOAD:%[0-9]+]]:psr(s64) = G_LOAD [[COPY]](p0) :: (load (s64) from %ir.a)
+ ; X64-NEXT: [[LOAD1:%[0-9]+]]:psr(s64) = G_LOAD [[COPY1]](p0) :: (load (s64) from %ir.b)
+ ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s64) = G_FADD [[LOAD]], [[LOAD1]]
+ ; X64-NEXT: G_STORE [[FADD]](s64), [[COPY]](p0) :: (store (s64) into %ir.a)
; X64-NEXT: RET 0
%load1 = load double, ptr %a, align 8
%load2 = load double, ptr %b, align 8
@@ -210,11 +207,9 @@ define void @f6(ptr %0, ptr %1) {
; X86-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0
; X86-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0)
; X86-NEXT: [[C:%[0-9]+]]:psr(s32) = G_FCONSTANT float 2.000000e+01
- ; X86-NEXT: [[LOAD2:%[0-9]+]]:gpr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.0)
- ; X86-NEXT: [[COPY:%[0-9]+]]:psr(s32) = COPY [[LOAD2]](s32)
- ; X86-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[COPY]], [[C]]
- ; X86-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY [[FADD]](s32)
- ; X86-NEXT: G_STORE [[COPY1]](s32), [[LOAD1]](p0) :: (store (s32) into %ir.1)
+ ; X86-NEXT: [[LOAD2:%[0-9]+]]:psr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.0)
+ ; X86-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[LOAD2]], [[C]]
+ ; X86-NEXT: G_STORE [[FADD]](s32), [[LOAD1]](p0) :: (store (s32) into %ir.1)
; X86-NEXT: RET 0
;
; X64-LABEL: name: f6
@@ -224,11 +219,9 @@ define void @f6(ptr %0, ptr %1) {
; X64-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY $rdi
; X64-NEXT: [[COPY1:%[0-9]+]]:gpr(p0) = COPY $rsi
; X64-NEXT: [[C:%[0-9]+]]:psr(s32) = G_FCONSTANT float 2.000000e+01
- ; X64-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.0)
- ; X64-NEXT: [[COPY2:%[0-9]+]]:psr(s32) = COPY [[LOAD]](s32)
- ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[COPY2]], [[C]]
- ; X64-NEXT: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[FADD]](s32)
- ; X64-NEXT: G_STORE [[COPY3]](s32), [[COPY1]](p0) :: (store (s32) into %ir.1)
+ ; X64-NEXT: [[LOAD:%[0-9]+]]:psr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.0)
+ ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[LOAD]], [[C]]
+ ; X64-NEXT: G_STORE [[FADD]](s32), [[COPY1]](p0) :: (store (s32) into %ir.1)
; X64-NEXT: RET 0
%load1 = load float, ptr %0
%add = fadd float %load1, 20.0
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 4242d8483e72..39c7ce1413d1 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
;
; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
;
; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -3507,13 +3507,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
;
; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3523,13 +3522,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3768,10 +3766,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
@@ -3784,10 +3782,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
@@ -4147,9 +4145,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
;
; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -4161,9 +4159,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index fbea08eb1e55..04d7a9691b64 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -26,7 +26,7 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll
index 906fead7f8db..a212f99680ef 100644
--- a/llvm/test/CodeGen/X86/dpbusd_i4.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll
@@ -86,7 +86,7 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 1436922f9dd1..6d5fc9ed0ab5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1828,22 +1828,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride3_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX512-NEXT: vmovdqa (%rdi), %ymm8
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-NEXT: vmovdqa %ymm0, %ymm3
; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
@@ -1857,14 +1857,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512-NEXT: vmovdqa %ymm1, %ymm10
+; AVX512-NEXT: vmovdqa %ymm0, %ymm10
; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1885,21 +1885,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15]
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -1908,22 +1906,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride3_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
; AVX512-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
@@ -1937,14 +1935,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm10
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -1965,21 +1963,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -1988,22 +1984,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-LABEL: load_i16_stride3_vf32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3
; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
@@ -2017,14 +2013,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm10
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2045,21 +2041,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2068,22 +2062,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
@@ -2097,14 +2091,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
@@ -2125,21 +2119,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3500,688 +3492,668 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-LABEL: load_i16_stride3_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
; AVX512-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm5
-; AVX512-NEXT: vmovdqa 272(%rdi), %xmm8
+; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7]
-; AVX512-NEXT: vmovdqa %xmm2, %xmm14
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-NEXT: vmovdqa %xmm2, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm22
-; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm23
-; AVX512-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm12
+; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21
+; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22
+; AVX512-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11
; AVX512-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512-NEXT: vmovdqa %xmm2, %xmm4
-; AVX512-NEXT: vmovdqa %xmm1, %xmm6
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm15, %xmm13, %xmm13
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16
-; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13
+; AVX512-NEXT: vmovdqa %xmm1, %xmm8
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
; AVX512-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5
+; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
-; AVX512-NEXT: vpshufb %xmm15, %xmm10, %xmm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12
; AVX512-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512-NEXT: vpshufb %ymm3, %ymm10, %ymm2
+; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm15
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
-; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18
-; AVX512-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25
; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa %ymm9, %ymm1
-; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512-NEXT: vmovdqa %xmm14, %xmm7
-; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm27
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm14
-; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19
-; AVX512-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3
-; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
-; AVX512-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5
-; AVX512-NEXT: vextracti32x4 $2, %zmm5, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi)
; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride3_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm8
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm14
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22
-; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
+; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm6
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
+; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm8
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm11
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm1
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm27
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
-; AVX512-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5
-; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride3_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm5
-; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm8
+; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7]
-; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm14
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6
; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm22
-; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm23
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm12
+; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21
+; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11
; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm6
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm15, %xmm13, %xmm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13
+; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm8
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5
+; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm11
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
-; AVX512DQ-NEXT: vpshufb %xmm15, %xmm10, %xmm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm10, %ymm2
+; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm15
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25
; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm1
-; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm27
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm14
-; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
-; AVX512DQ-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm27
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 8b6ba51506ab..8091afbbfd70 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -1246,29 +1246,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
-; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
-; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
-; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
-; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm0, 48(%rax)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
+; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, 48(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1326,29 +1325,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 48(%rax)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, 48(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
@@ -2053,77 +2051,76 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512-NEXT: vmovdqa (%rdx), %xmm6
-; AVX512-NEXT: vmovdqa (%rcx), %xmm7
-; AVX512-NEXT: vmovdqa (%r8), %xmm0
-; AVX512-NEXT: vmovdqa (%r10), %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
-; AVX512-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15]
-; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u]
-; AVX512-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-NEXT: vmovdqa (%rdx), %xmm5
+; AVX512-NEXT: vmovdqa (%rcx), %xmm6
+; AVX512-NEXT: vmovdqa (%r8), %xmm3
+; AVX512-NEXT: vmovdqa (%r9), %xmm4
+; AVX512-NEXT: vmovdqa (%r10), %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7
+; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25]
+; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13
+; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
+; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
+; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12
+; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u]
+; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
+; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
+; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11
+; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u]
+; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
-; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7
-; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7
-; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25]
-; AVX512-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
-; AVX512-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
-; AVX512-NEXT: vpandn %ymm3, %ymm6, %ymm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u]
-; AVX512-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512-NEXT: vmovdqa %xmm5, 96(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-NEXT: vmovdqa %ymm4, 64(%rax)
+; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7
+; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
+; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
+; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0
+; AVX512-NEXT: vmovdqa %xmm1, 96(%rax)
+; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -2131,70 +2128,69 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
-; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3]
-; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
-; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5
-; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6]
-; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6]
+; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u]
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11
+; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21]
+; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
+; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -2202,77 +2198,76 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u]
-; AVX512DQ-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5
+; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4
+; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512DQ-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25]
+; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
+; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
+; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
+; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11
+; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u]
+; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
-; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7
-; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512DQ-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25]
-; AVX512DQ-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
-; AVX512DQ-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
-; AVX512DQ-NEXT: vpandn %ymm3, %ymm6, %ymm3
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u]
-; AVX512DQ-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512DQ-NEXT: vmovdqa %xmm5, 96(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm4, 64(%rax)
+; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7
+; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
+; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0
+; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -2280,70 +2275,69 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
-; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6]
-; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6]
+; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11
+; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21]
+; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 0c76c14afb0a..4859a8e0eaaa 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -305,6 +305,37 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
ret <4 x float> %2
 }
 
+define <8 x i32> @combine_blend_of_permutes_v8i32(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX1-LABEL: combine_blend_of_permutes_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX2-LABEL: combine_blend_of_permutes_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: combine_blend_of_permutes_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,21,6,23,16,1,2,19]
+; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %s0 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %s1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ %x0 = bitcast <4 x i64> %s0 to <8 x i32>
+ %x1 = bitcast <4 x i64> %s1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 15>
+ ret <8 x i32> %r
+}
+
define <2 x double> @constant_fold_vpermilvar_pd() {
; CHECK-LABEL: constant_fold_vpermilvar_pd:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index f53b1eeaf8f5..e87e810971e1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -973,3 +973,47 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
%2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
ret <8 x i64> %2
}
+
+define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8 x i64> %a1) {
+; X86-AVX512F-LABEL: blend_of_permutes_v16i32:
+; X86-AVX512F: # %bb.0:
+; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X86-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A
+; X86-AVX512F-NEXT: kmovw %eax, %k1
+; X86-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; X86-AVX512F-NEXT: retl
+;
+; X86-AVX512BW-LABEL: blend_of_permutes_v16i32:
+; X86-AVX512BW: # %bb.0:
+; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X86-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A
+; X86-AVX512BW-NEXT: kmovd %eax, %k1
+; X86-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; X86-AVX512BW-NEXT: retl
+;
+; X64-AVX512F-LABEL: blend_of_permutes_v16i32:
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X64-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A
+; X64-AVX512F-NEXT: kmovw %eax, %k1
+; X64-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; X64-AVX512F-NEXT: retq
+;
+; X64-AVX512BW-LABEL: blend_of_permutes_v16i32:
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X64-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A
+; X64-AVX512BW-NEXT: kmovd %eax, %k1
+; X64-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; X64-AVX512BW-NEXT: retq
+ %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+ %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+ %x0 = bitcast <8 x i64> %s0 to <16 x i32>
+ %x1 = bitcast <8 x i64> %s1 to <16 x i32>
+ %r = shufflevector <16 x i32> %x0, <16 x i32> %x1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
+ ret <16 x i32> %r
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 5eb017bc80ca..33851f56fe8d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -22,6 +22,21 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
ret <16 x i8> %res0
 }
 
+define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: combine_blend_of_permutes_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE-NEXT: retq
+ %s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ %s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ %x0 = bitcast <2 x i64> %s0 to <4 x i32>
+ %x1 = bitcast <2 x i64> %s1 to <4 x i32>
+ %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x i32> %r
+}
+
define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; SSE-LABEL: PR50049:
; SSE: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 11f422d67154..99e8cdb179c8 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
;
; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
;
; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -4026,10 +4026,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -4062,10 +4062,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -4541,9 +4541,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
;
; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
@@ -4559,9 +4559,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0