diff options
author | Dinar Temirbulatov <Dinar.Temirbulatov@arm.com> | 2023-11-08 14:37:49 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-08 14:37:49 +0000 |
commit | 3f9d385e5844f2f1f144305037cfc904789c6187 (patch) | |
tree | bbf58654e8b1875cc095c319e81bb86a2c023c79 | |
parent | 9cdaeefc4542d889cc8aefbc7d7e69baa8675cd9 (diff) |
[AArch64][SME] Shuffle lowering, assume that the minimal SVE register is 128-bit, when NEON is not available. (#71647)
We can assume that the minimal SVE register is 128-bit when NEON is not
available, and we can then lower a shuffle operation with one
operand to the TBL1 SVE instruction.
6 files changed, 71 insertions, 179 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b9d6578ee33f..0fb9c3ef2cd2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26124,6 +26124,9 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, bool IsSingleOp = ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size()); + if (!Subtarget.isNeonAvailable() && !MinSVESize) + MinSVESize = 128; + // Ignore two operands if no SVE2 or all index numbers couldn't // be represented. if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize)) @@ -26135,9 +26138,8 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements(); unsigned MaskSize = ShuffleMask.size(); uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue(); - assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen && + assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen && "Incorrectly legalised shuffle operation"); - (void)MaskSize; SmallVector<SDValue, 8> TBLMask; for (int Index : ShuffleMask) { @@ -26333,8 +26335,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( } } - // Avoid producing TBL instruction if we don't know SVE register minimal size. - if (MinSVESize) + // Avoid producing TBL instruction if we don't know SVE register minimal size, + // unless NEON is not available and we can assume minimal SVE register size is + // 128-bits. 
+ if (MinSVESize || !Subtarget->isNeonAvailable()) return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT, DAG); diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index d8f6506cba69..25ecd7a8d7e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -184,15 +184,11 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) { define <2 x half> @extract_subvector_v4f16(<4 x half> %op) { ; CHECK-LABEL: extract_subvector_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI12_0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z1.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: str h1, [sp, #10] -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2) ret <2 x half> %ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 4947764f139e..1fc51d50b50a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -81,42 +81,22 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { define void @alloc_v32i8(ptr %st_ptr) nounwind { ; CHECK-LABEL: alloc_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: stp x30, x19, [sp, #32] 
// 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #16 +; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: bl def -; CHECK-NEXT: ldp q0, q3, [sp, #16] -; CHECK-NEXT: mov z1.b, z0.b[14] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z4.b, z0.b[10] -; CHECK-NEXT: mov z2.b, z0.b[12] -; CHECK-NEXT: mov z5.b, z0.b[8] -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.b, z0.b[6] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.b, z0.b[4] -; CHECK-NEXT: mov z0.b, z0.b[2] -; CHECK-NEXT: strb w8, [sp, #7] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: strb w9, [sp, #6] -; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: strb w8, [sp, #5] +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b +; CHECK-NEXT: ldr q1, [sp, #16] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w9, [sp, #4] -; CHECK-NEXT: strb w8, [sp, #3] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [sp, #1] -; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: strb w8, [x19, #8] -; CHECK-NEXT: ldr q0, [sp] ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: str x8, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %alloc = alloca [32 x i8] call void @def(ptr %alloc) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index 88fb73e64967..d1bff4fa21a1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -185,19 +185,11 @@ define void @test_revhv32i16(ptr %a) { define void @test_rev_elts_fail(ptr %a) { ; CHECK-LABEL: test_rev_elts_fail: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, 
[x0] -; CHECK-NEXT: mov z2.d, z0.d[1] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: mov z0.d, z1.d[1] -; CHECK-NEXT: fmov x9, d2 -; CHECK-NEXT: stp x9, x8, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: stp x9, x8, [sp, #16] -; CHECK-NEXT: ldp q1, q0, [sp] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: index z0.d, #1, #-1 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: tbl z1.d, { z1.d }, z0.d +; CHECK-NEXT: tbl z0.d, { z2.d }, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> @@ -240,30 +232,11 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 { define void @test_revv8i32(ptr %a) { ; CHECK-LABEL: test_revv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q0, q3, [x0] -; CHECK-NEXT: mov z1.s, z0.s[1] -; CHECK-NEXT: mov z2.s, z0.s[2] -; CHECK-NEXT: mov z4.s, z0.s[3] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.s, z3.s[1] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z1.s, z3.s[2] -; CHECK-NEXT: stp w9, w8, [sp, #24] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z2.s, z3.s[3] -; CHECK-NEXT: stp w9, w8, [sp, #16] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: stp w9, w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w9, w8, [sp] -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: index z0.s, #3, #-1 +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: tbl z1.s, { z1.s }, z0.s +; CHECK-NEXT: tbl z0.s, { z2.s }, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 1f036fa08ef1..d7bfb6b2680e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -342,21 +342,14 @@ define void @zip_v4i32(ptr %a, ptr %b) { define void @zip1_v8i32_undef(ptr %a) { ; CHECK-LABEL: zip1_v8i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z1.s, z0.s[3] -; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: tbl z1.s, { z0.s }, z1.s ; CHECK-NEXT: zip1 z0.s, z0.s, z0.s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w8, [sp, #8] -; CHECK-NEXT: stp w9, w9, [sp] -; CHECK-NEXT: ldr q1, [sp] -; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> @@ -389,41 +382,15 @@ define void @trn_v32i8(ptr %a, ptr %b) { define void @trn_v8i16(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v8i16: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: adrp x9, .LCPI8_1 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z1.h, z0.h[3] -; CHECK-NEXT: mov z2.h, z0.h[1] -; CHECK-NEXT: mov z3.h, z0.h[5] -; CHECK-NEXT: mov z4.h, z0.h[4] -; CHECK-NEXT: strh w8, [sp, #-32]! 
-; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov z1.h, z0.h[2] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.h, z0.h[6] -; CHECK-NEXT: mov z0.h, z0.h[7] -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w13, s2 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w10, [sp, #10] -; CHECK-NEXT: strh w12, [sp, #4] -; CHECK-NEXT: fmov w12, s0 -; CHECK-NEXT: strh w11, [sp, #8] -; CHECK-NEXT: strh w13, [sp, #6] -; CHECK-NEXT: strh w12, [sp, #2] -; CHECK-NEXT: strh w12, [sp, #28] -; CHECK-NEXT: strh w11, [sp, #26] -; CHECK-NEXT: strh w10, [sp, #22] -; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: strh w13, [sp, #18] -; CHECK-NEXT: strh w9, [sp, #16] -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] +; CHECK-NEXT: tbl z1.h, { z0.h }, z1.h +; CHECK-NEXT: tbl z0.h, { z0.h }, z2.h +; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b @@ -692,21 +659,14 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{ define void @zip2_v8i32_undef(ptr %a) #0{ ; CHECK-LABEL: zip2_v8i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI17_0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: mov z1.s, z0.s[3] -; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: tbl z1.s, { z0.s }, z1.s ; CHECK-NEXT: zip1 z0.s, z0.s, z0.s -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w8, [sp, #8] -; CHECK-NEXT: stp w9, w9, [sp] -; CHECK-NEXT: ldr q1, [sp] -; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: str q1, [x0, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret %tmp1 = load 
volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> @@ -921,26 +881,15 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ define void @uzp_v4i16(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v4i16: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: adrp x9, .LCPI19_1 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: strh w8, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: fmov w11, s3 -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: strh w10, [sp, #4] -; CHECK-NEXT: strh w11, [sp, #2] -; CHECK-NEXT: strh w10, [sp, #12] -; CHECK-NEXT: ldp d0, d1, [sp] -; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1] +; CHECK-NEXT: tbl z1.h, { z0.h }, z1.h +; CHECK-NEXT: tbl z0.h, { z0.h }, z2.h +; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: str d0, [x0] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %a %tmp2 = load <4 x i16>, ptr %b @@ -1071,11 +1020,12 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: ldp q2, q0, [x0] -; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: ldp q4, q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z3.s, z0.s[2] ; CHECK-NEXT: mov z5.s, z1.s[2] ; CHECK-NEXT: stp s0, s3, [sp, #24] @@ -1085,17 +1035,17 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: stp s3, s1, [sp, #4] ; CHECK-NEXT: mov 
z1.s, z2.s[1] -; CHECK-NEXT: stp s0, s5, [sp, #40] -; CHECK-NEXT: mov z5.s, z4.s[3] -; CHECK-NEXT: mov z4.s, z4.s[1] +; CHECK-NEXT: str s5, [sp, #44] +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: str s0, [sp, #40] ; CHECK-NEXT: ldp q3, q2, [sp] +; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s ; CHECK-NEXT: str s1, [sp, #32] -; CHECK-NEXT: stp s4, s5, [sp, #48] -; CHECK-NEXT: ldp q0, q1, [sp, #32] -; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: ldr q1, [sp, #32] +; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = load <8 x float>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll index 03e156cb4aff..175731480407 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -8,22 +8,11 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: shuffle_ext_byone_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z2.h, z0.h[2] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: 
tbl z0.h, { z0.h }, z1.h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1> ret <4 x i8> %ret |