summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDinar Temirbulatov <Dinar.Temirbulatov@arm.com>2023-11-08 14:37:49 +0000
committerGitHub <noreply@github.com>2023-11-08 14:37:49 +0000
commit3f9d385e5844f2f1f144305037cfc904789c6187 (patch)
treebbf58654e8b1875cc095c319e81bb86a2c023c79
parent9cdaeefc4542d889cc8aefbc7d7e69baa8675cd9 (diff)
[AArch64][SME] Shuffle lowering, assume that the minimal SVE register is 128-bit, when NEON is not available. (#71647)
We can assume that the minimal SVE register is 128-bit when NEON is not available, and we can lower the shuffle operation with one operand to the TBL1 SVE instruction.
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp12
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll40
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll47
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll120
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll19
6 files changed, 71 insertions, 179 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b9d6578ee33f..0fb9c3ef2cd2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26124,6 +26124,9 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
bool IsSingleOp =
ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
+ if (!Subtarget.isNeonAvailable() && !MinSVESize)
+ MinSVESize = 128;
+
// Ignore two operands if no SVE2 or all index numbers couldn't
// be represented.
if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
@@ -26135,9 +26138,8 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
unsigned MaskSize = ShuffleMask.size();
uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
- assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen &&
+ assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
"Incorrectly legalised shuffle operation");
- (void)MaskSize;
SmallVector<SDValue, 8> TBLMask;
for (int Index : ShuffleMask) {
@@ -26333,8 +26335,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
}
}
- // Avoid producing TBL instruction if we don't know SVE register minimal size.
- if (MinSVESize)
+ // Avoid producing TBL instruction if we don't know SVE register minimal size,
+ // unless NEON is not available and we can assume minimal SVE register size is
+ // 128-bits.
+ if (MinSVESize || !Subtarget->isNeonAvailable())
return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
DAG);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index d8f6506cba69..25ecd7a8d7e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -184,15 +184,11 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) {
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
; CHECK-LABEL: extract_subvector_v4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z1.h, z0.h[3]
-; CHECK-NEXT: mov z0.h, z0.h[2]
-; CHECK-NEXT: str h1, [sp, #10]
-; CHECK-NEXT: str h0, [sp, #8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
ret <2 x half> %ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 4947764f139e..1fc51d50b50a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -81,42 +81,22 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-LABEL: alloc_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #16
+; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: bl def
-; CHECK-NEXT: ldp q0, q3, [sp, #16]
-; CHECK-NEXT: mov z1.b, z0.b[14]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z4.b, z0.b[10]
-; CHECK-NEXT: mov z2.b, z0.b[12]
-; CHECK-NEXT: mov z5.b, z0.b[8]
-; CHECK-NEXT: strb w8, [sp]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.b, z0.b[6]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.b, z0.b[4]
-; CHECK-NEXT: mov z0.b, z0.b[2]
-; CHECK-NEXT: strb w8, [sp, #7]
-; CHECK-NEXT: fmov w8, s4
-; CHECK-NEXT: strb w9, [sp, #6]
-; CHECK-NEXT: fmov w9, s5
-; CHECK-NEXT: strb w8, [sp, #5]
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT: ldr q1, [sp, #16]
; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w9, [sp, #4]
-; CHECK-NEXT: strb w8, [sp, #3]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [sp, #1]
-; CHECK-NEXT: fmov w8, s3
; CHECK-NEXT: strb w8, [x19, #8]
-; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: str x8, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
%alloc = alloca [32 x i8]
call void @def(ptr %alloc)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index 88fb73e64967..d1bff4fa21a1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -185,19 +185,11 @@ define void @test_revhv32i16(ptr %a) {
define void @test_rev_elts_fail(ptr %a) {
; CHECK-LABEL: test_rev_elts_fail:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: mov z2.d, z0.d[1]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: mov z0.d, z1.d[1]
-; CHECK-NEXT: fmov x9, d2
-; CHECK-NEXT: stp x9, x8, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: stp x9, x8, [sp, #16]
-; CHECK-NEXT: ldp q1, q0, [sp]
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: index z0.d, #1, #-1
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: tbl z1.d, { z1.d }, z0.d
+; CHECK-NEXT: tbl z0.d, { z2.d }, z0.d
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%tmp1 = load <4 x i64>, ptr %a
%tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -240,30 +232,11 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 {
define void @test_revv8i32(ptr %a) {
; CHECK-LABEL: test_revv8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: ldp q0, q3, [x0]
-; CHECK-NEXT: mov z1.s, z0.s[1]
-; CHECK-NEXT: mov z2.s, z0.s[2]
-; CHECK-NEXT: mov z4.s, z0.s[3]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z0.s, z3.s[1]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: mov z1.s, z3.s[2]
-; CHECK-NEXT: stp w9, w8, [sp, #24]
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: mov z2.s, z3.s[3]
-; CHECK-NEXT: stp w9, w8, [sp, #16]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: stp w9, w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w9, w8, [sp]
-; CHECK-NEXT: ldp q0, q1, [sp]
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: index z0.s, #3, #-1
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: tbl z1.s, { z1.s }, z0.s
+; CHECK-NEXT: tbl z0.s, { z2.s }, z0.s
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%tmp1 = load <8 x i32>, ptr %a
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index 1f036fa08ef1..d7bfb6b2680e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -342,21 +342,14 @@ define void @zip_v4i32(ptr %a, ptr %b) {
define void @zip1_v8i32_undef(ptr %a) {
; CHECK-LABEL: zip1_v8i32_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: adrp x8, .LCPI6_0
; CHECK-NEXT: ldr q0, [x0, #16]
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov z1.s, z0.s[3]
-; CHECK-NEXT: mov z2.s, z0.s[2]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT: tbl z1.s, { z0.s }, z1.s
; CHECK-NEXT: zip1 z0.s, z0.s, z0.s
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w8, [sp, #8]
-; CHECK-NEXT: stp w9, w9, [sp]
-; CHECK-NEXT: ldr q1, [sp]
-; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%tmp1 = load volatile <8 x i32>, ptr %a
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -389,41 +382,15 @@ define void @trn_v32i8(ptr %a, ptr %b) {
define void @trn_v8i16(ptr %a, ptr %b) {
; CHECK-LABEL: trn_v8i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: adrp x9, .LCPI8_1
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z1.h, z0.h[3]
-; CHECK-NEXT: mov z2.h, z0.h[1]
-; CHECK-NEXT: mov z3.h, z0.h[5]
-; CHECK-NEXT: mov z4.h, z0.h[4]
-; CHECK-NEXT: strh w8, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.h, z0.h[2]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z0.h[6]
-; CHECK-NEXT: mov z0.h, z0.h[7]
-; CHECK-NEXT: fmov w10, s3
-; CHECK-NEXT: fmov w11, s4
-; CHECK-NEXT: fmov w12, s1
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w13, s2
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: strh w10, [sp, #10]
-; CHECK-NEXT: strh w12, [sp, #4]
-; CHECK-NEXT: fmov w12, s0
-; CHECK-NEXT: strh w11, [sp, #8]
-; CHECK-NEXT: strh w13, [sp, #6]
-; CHECK-NEXT: strh w12, [sp, #2]
-; CHECK-NEXT: strh w12, [sp, #28]
-; CHECK-NEXT: strh w11, [sp, #26]
-; CHECK-NEXT: strh w10, [sp, #22]
-; CHECK-NEXT: strh w8, [sp, #20]
-; CHECK-NEXT: strh w13, [sp, #18]
-; CHECK-NEXT: strh w9, [sp, #16]
-; CHECK-NEXT: ldp q0, q1, [sp]
-; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1]
+; CHECK-NEXT: tbl z1.h, { z0.h }, z1.h
+; CHECK-NEXT: tbl z0.h, { z0.h }, z2.h
+; CHECK-NEXT: add z0.h, z1.h, z0.h
; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %a
%tmp2 = load <8 x i16>, ptr %b
@@ -692,21 +659,14 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
define void @zip2_v8i32_undef(ptr %a) #0{
; CHECK-LABEL: zip2_v8i32_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q0, [x0, #16]
-; CHECK-NEXT: mov z1.s, z0.s[3]
-; CHECK-NEXT: mov z2.s, z0.s[2]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT: tbl z1.s, { z0.s }, z1.s
; CHECK-NEXT: zip1 z0.s, z0.s, z0.s
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w8, w8, [sp, #8]
-; CHECK-NEXT: stp w9, w9, [sp]
-; CHECK-NEXT: ldr q1, [sp]
-; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: str q1, [x0, #16]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%tmp1 = load volatile <8 x i32>, ptr %a
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
@@ -921,26 +881,15 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
define void @uzp_v4i16(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v4i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI19_0
+; CHECK-NEXT: adrp x9, .LCPI19_1
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z2.h, z0.h[2]
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: strh w8, [sp, #-16]!
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: fmov w10, s2
-; CHECK-NEXT: fmov w11, s3
-; CHECK-NEXT: strh w9, [sp, #6]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: strh w10, [sp, #4]
-; CHECK-NEXT: strh w11, [sp, #2]
-; CHECK-NEXT: strh w10, [sp, #12]
-; CHECK-NEXT: ldp d0, d1, [sp]
-; CHECK-NEXT: add z0.h, z0.h, z1.h
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1]
+; CHECK-NEXT: tbl z1.h, { z0.h }, z1.h
+; CHECK-NEXT: tbl z0.h, { z0.h }, z2.h
+; CHECK-NEXT: add z0.h, z1.h, z0.h
; CHECK-NEXT: str d0, [x0]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %a
%tmp2 = load <4 x i16>, ptr %b
@@ -1071,11 +1020,12 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
define void @uzp_v8f32(ptr %a, ptr %b) #0{
; CHECK-LABEL: uzp_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: ldp q2, q0, [x0]
-; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: adrp x8, .LCPI21_0
; CHECK-NEXT: ldp q4, q1, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov z3.s, z0.s[2]
; CHECK-NEXT: mov z5.s, z1.s[2]
; CHECK-NEXT: stp s0, s3, [sp, #24]
@@ -1085,17 +1035,17 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
; CHECK-NEXT: mov z0.s, z0.s[1]
; CHECK-NEXT: stp s3, s1, [sp, #4]
; CHECK-NEXT: mov z1.s, z2.s[1]
-; CHECK-NEXT: stp s0, s5, [sp, #40]
-; CHECK-NEXT: mov z5.s, z4.s[3]
-; CHECK-NEXT: mov z4.s, z4.s[1]
+; CHECK-NEXT: str s5, [sp, #44]
+; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT: str s0, [sp, #40]
; CHECK-NEXT: ldp q3, q2, [sp]
+; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s
; CHECK-NEXT: str s1, [sp, #32]
-; CHECK-NEXT: stp s4, s5, [sp, #48]
-; CHECK-NEXT: ldp q0, q1, [sp, #32]
-; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: ldr q1, [sp, #32]
+; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s
+; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
%tmp1 = load <8 x float>, ptr %a
%tmp2 = load <8 x float>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index 03e156cb4aff..175731480407 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -8,22 +8,11 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-LABEL: shuffle_ext_byone_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov z1.h, z0.h[1]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z2.h, z0.h[2]
-; CHECK-NEXT: mov z3.h, z0.h[3]
-; CHECK-NEXT: strh w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
ret <4 x i8> %ret