diff options
Diffstat (limited to 'llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll')
-rw-r--r-- | llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll | 280 |
1 files changed, 136 insertions, 144 deletions
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 955927eb7691..265f6daeb200 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -160,24 +160,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rax) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -186,24 +185,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -212,24 +210,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -238,24 +235,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -264,24 +260,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -290,24 +285,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -316,24 +310,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -342,24 +335,23 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 |