diff options
author | Dhruv Chawla <dhruvc@nvidia.com> | 2024-03-13 13:18:36 +0530 |
---|---|---|
committer | Dhruv Chawla <dhruvc@nvidia.com> | 2024-03-13 13:18:36 +0530 |
commit | 97c7e27bdd014706e66aa5c559659cbfac9e2383 (patch) | |
tree | 849f9ddcd14c134c186a9e37ea9defc5da748952 | |
parent | 0d98582c8b86644e77f8ddd68fc251e41127b7f4 (diff) | |
parent | 65420b76c461b7ad49e469fa8092cd9e7bed398e (diff) |
[𝘀𝗽𝗿] initial version upstream/users/dc03-work/spr/aarch64globalisel-avoid-splitting-loads-of-large-vector-types-into-individual-element-loads-1
Created using spr 1.3.5
-rw-r--r-- | llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 20 | ||||
-rw-r--r-- | llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 70 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir | 41 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/fcmp.ll | 81 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/sext.ll | 14 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/vecreduce-add.ll | 905 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/zext.ll | 18 |
7 files changed, 379 insertions, 770 deletions
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index bd3ff7265d51..a480c2909077 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5496,6 +5496,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, return Legalized; } + case TargetOpcode::G_SEXT: + case TargetOpcode::G_ZEXT: + case TargetOpcode::G_ANYEXT: { + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (TypeIdx == 0) { + DstTy = MoreTy; + SrcTy = MoreTy.changeElementType(SrcTy.getElementType()); + } else if (TypeIdx == 1) { + SrcTy = MoreTy; + DstTy = MoreTy.changeElementType(DstTy.getElementType()); + } + + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, SrcTy, 1); + moreElementsVectorDst(MI, DstTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + default: return UnableToLegalize; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 36adada27965..fc1063b6bd48 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -356,6 +356,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return Query.Types[0] == s128 && Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; }) + .customIf([=](const LegalityQuery &Query) { + // We need custom legalization for loads greater than 128-bits as they + // need to be split up into chunks. 
+ return Query.Types[0].isFixedVector() && + Query.Types[0].getSizeInBits() > 128; + }) .legalForTypesWithMemDesc({{s8, p0, s8, 8}, {s16, p0, s16, 8}, {s32, p0, s32, 8}, @@ -1632,6 +1638,70 @@ bool AArch64LegalizerInfo::legalizeLoadStore( Register ValReg = MI.getOperand(0).getReg(); const LLT ValTy = MRI.getType(ValReg); + if (ValTy.isFixedVector() && ValTy.getSizeInBits() > 128) { + // Break fixed-width vector loads of sizes greater than 128 bits into chunks + // of 128-bit vector loads with the same element type. + Register LoadReg = MI.getOperand(1).getReg(); + Register LoadRegWithOffset = LoadReg; + + unsigned EltSize = ValTy.getScalarSizeInBits(); + // Only support element types which can cleanly divide into 128-bit wide + // vectors. + if (128 % EltSize != 0) + return false; + + unsigned NewEltCount = 128 / EltSize; + LLT NewTy = LLT::fixed_vector(NewEltCount, ValTy.getElementType()); + + unsigned OldEltCount = ValTy.getNumElements(); + unsigned NumVecs = OldEltCount / NewEltCount; + + // Create registers to represent each element of ValReg. Load into these, + // then combine them at the end. 
+ SmallVector<Register, 16> ComponentRegs; + for (unsigned i = 0, e = ValTy.getNumElements(); i != e; i++) + ComponentRegs.push_back( + MRI.createGenericVirtualRegister(ValTy.getElementType())); + + MachineMemOperand &MMO = **MI.memoperands_begin(); + auto GetMMO = [&MMO, &MI](int64_t Offset, LLT Ty) { + return MI.getMF()->getMachineMemOperand(&MMO, Offset, Ty); + }; + + for (unsigned i = 0, e = NumVecs; i != e; i++) { + auto LoadChunk = MIRBuilder.buildLoad( + NewTy, LoadRegWithOffset, *GetMMO(i * NewTy.getSizeInBytes(), NewTy)); + + auto LoadOffset = MIRBuilder.buildConstant( + LLT::scalar(64), (i + 1) * NewTy.getSizeInBytes()); + + LoadRegWithOffset = + MIRBuilder.buildPtrAdd(MRI.getType(LoadReg), LoadReg, LoadOffset) + .getReg(0); + + Register *ChunkFirstReg = ComponentRegs.begin() + (i * NewEltCount); + MIRBuilder.buildUnmerge({ChunkFirstReg, ChunkFirstReg + NewEltCount}, + LoadChunk.getReg(0)); + } + + unsigned ExtraElems = OldEltCount % NewEltCount; + if (ExtraElems != 0) { + LLT ExtraTy = LLT::fixed_vector(ExtraElems, ValTy.getElementType()); + + auto ExtraLoadChunk = MIRBuilder.buildLoad( + ExtraTy, LoadRegWithOffset, + *GetMMO(NumVecs * NewTy.getSizeInBytes(), ExtraTy)); + + MIRBuilder.buildUnmerge({ComponentRegs.begin() + (NumVecs * NewEltCount), + ComponentRegs.end()}, + ExtraLoadChunk.getReg(0)); + } + + MIRBuilder.buildBuildVector(ValReg, ComponentRegs); + MI.eraseFromParent(); + return true; + } + if (ValTy == LLT::scalar(128)) { AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir index 5cbb8649d158..aa152aea81ff 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir @@ -711,33 +711,24 @@ body: | ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %ptr:_(p0) = COPY $x0 - ; CHECK-NEXT: 
[[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24) - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 - ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64) - ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32) - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 - ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64) - ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0) - ; CHECK-NEXT: 
[[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0) - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>) - ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64) - ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64) - ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>) - ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16) - ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64) - ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>) - ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>) + ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>) + ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64) + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64) + ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>) + ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16) + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64) + ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>) + ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32) ; CHECK-NEXT: RET_ReallyLR %ptr:_(p0) = COPY $x0 %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>)) diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 2d0b5574cdd7..9916aeeab1ca 100644 
--- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -1108,61 +1108,54 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; ; CHECK-GI-FP16-LABEL: v7f16_i32: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: fcmgt v1.8h, v1.8h, v0.8h -; CHECK-GI-FP16-NEXT: mov w12, #31 // =0x1f -; CHECK-GI-FP16-NEXT: ldr s4, [sp] -; CHECK-GI-FP16-NEXT: fmov s2, w12 +; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h +; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f +; CHECK-GI-FP16-NEXT: ldr s3, [sp] +; CHECK-GI-FP16-NEXT: fmov s1, w10 ; CHECK-GI-FP16-NEXT: fmov s6, w0 -; CHECK-GI-FP16-NEXT: ldr s5, [sp, #8] +; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8] ; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32] -; CHECK-GI-FP16-NEXT: umov w9, v1.h[4] -; CHECK-GI-FP16-NEXT: umov w8, v1.h[0] -; CHECK-GI-FP16-NEXT: umov w11, v1.h[5] -; CHECK-GI-FP16-NEXT: umov w10, v1.h[1] -; CHECK-GI-FP16-NEXT: mov v2.s[1], w12 -; CHECK-GI-FP16-NEXT: umov w13, v1.h[2] +; CHECK-GI-FP16-NEXT: umov w8, v0.h[4] +; CHECK-GI-FP16-NEXT: umov w9, v0.h[5] +; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 ; CHECK-GI-FP16-NEXT: mov v6.s[1], w1 ; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40] -; CHECK-GI-FP16-NEXT: fmov s3, w9 -; CHECK-GI-FP16-NEXT: fmov s0, w8 -; CHECK-GI-FP16-NEXT: umov w8, v1.h[6] -; CHECK-GI-FP16-NEXT: mov v2.s[2], w12 -; CHECK-GI-FP16-NEXT: umov w9, v1.h[3] +; CHECK-GI-FP16-NEXT: fmov s2, w8 +; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] +; CHECK-GI-FP16-NEXT: mov v1.s[2], w10 +; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 ; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] -; CHECK-GI-FP16-NEXT: mov v3.s[1], w11 -; CHECK-GI-FP16-NEXT: mov v0.s[1], w10 -; CHECK-GI-FP16-NEXT: mov w10, #-1 // =0xffffffff -; CHECK-GI-FP16-NEXT: fmov s1, w10 -; CHECK-GI-FP16-NEXT: neg v17.4s, v2.4s +; CHECK-GI-FP16-NEXT: mov v2.s[1], w9 +; CHECK-GI-FP16-NEXT: mov w9, #-1 // 
=0xffffffff +; CHECK-GI-FP16-NEXT: fmov s5, w9 +; CHECK-GI-FP16-NEXT: neg v17.4s, v1.4s +; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 +; CHECK-GI-FP16-NEXT: mov v2.s[2], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s3 +; CHECK-GI-FP16-NEXT: fmov s3, w7 +; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 +; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31 +; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 +; CHECK-GI-FP16-NEXT: fmov w8, s4 +; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] +; CHECK-GI-FP16-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-GI-FP16-NEXT: fmov s2, w4 +; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 +; CHECK-GI-FP16-NEXT: mov v2.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 +; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v17.4s ; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: fmov s4, w7 -; CHECK-GI-FP16-NEXT: mov v0.s[2], w13 -; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 -; CHECK-GI-FP16-NEXT: mov v4.s[1], w8 -; CHECK-GI-FP16-NEXT: fmov w8, s5 -; CHECK-GI-FP16-NEXT: ldr s5, [sp, #16] -; CHECK-GI-FP16-NEXT: ushl v2.4s, v3.4s, v2.4s -; CHECK-GI-FP16-NEXT: fmov s3, w4 -; CHECK-GI-FP16-NEXT: mov v0.s[3], w9 -; CHECK-GI-FP16-NEXT: mov v1.s[2], w10 -; CHECK-GI-FP16-NEXT: mov v3.s[1], w5 -; CHECK-GI-FP16-NEXT: mov v4.s[2], w8 -; CHECK-GI-FP16-NEXT: sshl v2.4s, v2.4s, v17.4s -; CHECK-GI-FP16-NEXT: fmov w8, s5 -; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 -; CHECK-GI-FP16-NEXT: eor v1.16b, v2.16b, v1.16b -; CHECK-GI-FP16-NEXT: mov v3.s[2], w6 -; CHECK-GI-FP16-NEXT: mov v4.s[3], w8 -; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31 -; CHECK-GI-FP16-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-FP16-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v4.16b -; CHECK-GI-FP16-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-GI-FP16-NEXT: eor v4.16b, v1.16b, v5.16b +; CHECK-GI-FP16-NEXT: mov v2.s[2], w6 +; CHECK-GI-FP16-NEXT: mov v3.s[3], w8 +; CHECK-GI-FP16-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v4.16b +; CHECK-GI-FP16-NEXT: bsl v0.16b, 
v6.16b, v3.16b +; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b ; CHECK-GI-FP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-FP16-NEXT: mov s3, v0.s[2] ; CHECK-GI-FP16-NEXT: mov s4, v0.s[3] diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 61f04fbf0484..3e0d5dd87509 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -280,13 +280,12 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) { ; ; CHECK-GI-LABEL: sext_v3i8_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: fmov d0, x0 -; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-GI-NEXT: sxtb x8, w2 ; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.d[1], x1 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56 ; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56 ; CHECK-GI-NEXT: mov d1, v0.d[1] @@ -444,13 +443,12 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v3i10_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: fmov d0, x0 -; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-GI-NEXT: sbfx x8, x2, #0, #10 ; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.d[1], x1 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54 ; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #54 ; CHECK-GI-NEXT: mov d1, v0.d[1] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 66b49466cc73..66ef436f48c6 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -4,11 +4,6 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel 
-global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT -; CHECK-GI-BASE: warning: Instruction selection used fallback path for test_udot_v24i8 -; CHECK-GI-BASE-NEXT: warning: Instruction selection used fallback path for test_udot_v48i8 -; CHECK-GI-BASE-NEXT: warning: Instruction selection used fallback path for test_sdot_v24i8 -; CHECK-GI-BASE-NEXT: warning: Instruction selection used fallback path for test_sdot_v48i8 - define i32 @addv_v2i32(<2 x i32> %a) { ; CHECK-LABEL: addv_v2i32: ; CHECK: // %bb.0: // %entry @@ -2070,126 +2065,50 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldr q0, [x0] ; CHECK-GI-BASE-NEXT: ldr q1, [x1] -; CHECK-GI-BASE-NEXT: ldr d4, [x0, #16] -; CHECK-GI-BASE-NEXT: ldr d5, [x1, #16] -; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16] +; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16] +; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: umull v6.4s, v3.4h, v2.4h -; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: ushll v3.8h, v4.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v4.8h, v5.8b, #0 -; CHECK-GI-BASE-NEXT: umlal2 v2.4s, v4.8h, v3.8h -; CHECK-GI-BASE-NEXT: umlal v6.4s, v4.4h, v3.4h -; CHECK-GI-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h -; CHECK-GI-BASE-NEXT: umlal v6.4s, v1.4h, v0.4h -; CHECK-GI-BASE-NEXT: add v0.4s, v6.4s, v2.4s +; CHECK-GI-BASE-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: umull v6.4s, v5.4h, v4.4h +; CHECK-GI-BASE-NEXT: umull2 v4.4s, v5.8h, v4.8h +; CHECK-GI-BASE-NEXT: 
umull2 v5.4s, v1.8h, v0.8h +; CHECK-GI-BASE-NEXT: umull v7.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: umull v0.4s, v1.4h, v0.4h +; CHECK-GI-BASE-NEXT: umull2 v1.4s, v3.8h, v2.8h +; CHECK-GI-BASE-NEXT: addv s2, v6.4s +; CHECK-GI-BASE-NEXT: addv s3, v4.4s +; CHECK-GI-BASE-NEXT: addv s4, v5.4s +; CHECK-GI-BASE-NEXT: addv s5, v7.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: fmov w0, s0 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: fmov w8, s2 +; CHECK-GI-BASE-NEXT: fmov w9, s3 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: fmov w9, s0 +; CHECK-GI-BASE-NEXT: add w10, w10, w11 +; CHECK-GI-BASE-NEXT: fmov w11, s1 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: add w9, w10, w11 +; CHECK-GI-BASE-NEXT: add w0, w8, w9 ; CHECK-GI-BASE-NEXT: ret ; ; CHECK-GI-DOT-LABEL: test_udot_v24i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: ldr b1, [x0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #1] ; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: ldr b2, [x1] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #1] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #8] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #2] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8] -; CHECK-GI-DOT-NEXT: mov v2.b[1], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #2] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #17] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #17] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #3] -; CHECK-GI-DOT-NEXT: mov v2.b[2], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #3] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #4] -; CHECK-GI-DOT-NEXT: mov v2.b[3], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #4] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #5] -; CHECK-GI-DOT-NEXT: mov v2.b[4], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #5] -; CHECK-GI-DOT-NEXT: mov v1.b[5], 
v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #6] -; CHECK-GI-DOT-NEXT: mov v2.b[5], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #6] -; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #7] -; CHECK-GI-DOT-NEXT: mov v2.b[6], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #7] -; CHECK-GI-DOT-NEXT: mov v1.b[7], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #16] -; CHECK-GI-DOT-NEXT: mov v2.b[7], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #16] -; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #18] -; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18] -; CHECK-GI-DOT-NEXT: mov v1.b[8], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #9] -; CHECK-GI-DOT-NEXT: mov v2.b[8], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #9] -; CHECK-GI-DOT-NEXT: mov v3.b[2], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #19] -; CHECK-GI-DOT-NEXT: mov v4.b[2], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19] -; CHECK-GI-DOT-NEXT: mov v1.b[9], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #10] -; CHECK-GI-DOT-NEXT: mov v2.b[9], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #10] -; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #20] -; CHECK-GI-DOT-NEXT: mov v4.b[3], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20] -; CHECK-GI-DOT-NEXT: mov v1.b[10], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #11] -; CHECK-GI-DOT-NEXT: mov v2.b[10], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #11] -; CHECK-GI-DOT-NEXT: mov v3.b[4], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #21] -; CHECK-GI-DOT-NEXT: mov v4.b[4], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21] -; CHECK-GI-DOT-NEXT: mov v1.b[11], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #12] -; CHECK-GI-DOT-NEXT: mov v2.b[11], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #12] -; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #22] -; CHECK-GI-DOT-NEXT: mov v4.b[5], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22] -; CHECK-GI-DOT-NEXT: mov 
v1.b[12], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #13] -; CHECK-GI-DOT-NEXT: mov v2.b[12], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #13] -; CHECK-GI-DOT-NEXT: mov v3.b[6], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #23] -; CHECK-GI-DOT-NEXT: mov v4.b[6], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23] -; CHECK-GI-DOT-NEXT: mov v1.b[13], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #14] -; CHECK-GI-DOT-NEXT: mov v2.b[13], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #14] -; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0] -; CHECK-GI-DOT-NEXT: mov v4.b[7], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[14], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15] -; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15] -; CHECK-GI-DOT-NEXT: fmov d3, d3 -; CHECK-GI-DOT-NEXT: fmov d4, d4 -; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0] -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0] -; CHECK-GI-DOT-NEXT: udot v0.4s, v4.16b, v3.16b -; CHECK-GI-DOT-NEXT: udot v5.4s, v2.16b, v1.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: ldr q2, [x0] +; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16] +; CHECK-GI-DOT-NEXT: ldr q4, [x1] +; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16] +; CHECK-GI-DOT-NEXT: udot v1.4s, v4.16b, v2.16b +; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret @@ -2257,243 +2176,91 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-BASE-LABEL: test_udot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry -; CHECK-GI-BASE-NEXT: ldp q0, q4, [x1] -; CHECK-GI-BASE-NEXT: ldr q2, [x0, #32] -; CHECK-GI-BASE-NEXT: ldp q1, q3, [x0] -; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] -; CHECK-GI-BASE-NEXT: ushll2 v16.8h, v2.16b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0 -; 
CHECK-GI-BASE-NEXT: ushll2 v17.8h, v7.16b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v5.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-BASE-NEXT: umull2 v18.4s, v6.8h, v5.8h -; CHECK-GI-BASE-NEXT: umull v19.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: umull v5.4s, v6.4h, v5.4h -; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: ushll v1.8h, v2.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v2.8h, v7.8b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v3.16b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v4.16b, #0 -; CHECK-GI-BASE-NEXT: umlal2 v18.4s, v17.8h, v16.8h -; CHECK-GI-BASE-NEXT: umlal v5.4s, v17.4h, v16.4h -; CHECK-GI-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h -; CHECK-GI-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h -; CHECK-GI-BASE-NEXT: ushll v1.8h, v3.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v2.8h, v4.8b, #0 -; CHECK-GI-BASE-NEXT: umlal2 v18.4s, v7.8h, v6.8h -; CHECK-GI-BASE-NEXT: umlal v5.4s, v7.4h, v6.4h -; CHECK-GI-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h -; CHECK-GI-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h -; CHECK-GI-BASE-NEXT: add v1.4s, v19.4s, v5.4s -; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] +; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] +; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-BASE-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-BASE-NEXT: ushll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h +; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h +; CHECK-GI-BASE-NEXT: umull2 v19.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v16.4h +; CHECK-GI-BASE-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: ushll v5.8h, 
v6.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v1.8h, v17.8b, #0 +; CHECK-GI-BASE-NEXT: umull2 v7.4s, v7.8h, v16.8h +; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v17.8h, v17.16b, #0 +; CHECK-GI-BASE-NEXT: addv s16, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: umull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h +; CHECK-GI-BASE-NEXT: addv s3, v19.4s +; CHECK-GI-BASE-NEXT: umull v19.4s, v5.4h, v1.4h +; CHECK-GI-BASE-NEXT: umull2 v1.4s, v5.8h, v1.8h +; CHECK-GI-BASE-NEXT: addv s5, v20.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: fmov w0, s0 +; CHECK-GI-BASE-NEXT: addv s7, v7.4s +; CHECK-GI-BASE-NEXT: umull v20.4s, v6.4h, v17.4h +; CHECK-GI-BASE-NEXT: umull2 v6.4s, v6.8h, v17.8h +; CHECK-GI-BASE-NEXT: fmov w8, s16 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s3 +; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s2, v2.4s +; CHECK-GI-BASE-NEXT: fmov w11, s5 +; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: fmov w9, s0 +; CHECK-GI-BASE-NEXT: addv s0, v1.4s +; CHECK-GI-BASE-NEXT: addv s1, v20.4s +; CHECK-GI-BASE-NEXT: addv s5, v6.4s +; CHECK-GI-BASE-NEXT: add w10, w10, w11 +; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: fmov w12, s2 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: fmov w9, s7 +; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: add w10, w11, w12 +; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: add w9, w10, w11 +; CHECK-GI-BASE-NEXT: fmov w10, s0 +; CHECK-GI-BASE-NEXT: fmov w11, s5 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: add w9, w10, w11 +; CHECK-GI-BASE-NEXT: add w0, w8, w9 ; CHECK-GI-BASE-NEXT: ret ; ; CHECK-GI-DOT-LABEL: test_udot_v48i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: ldr b1, 
[x0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1] ; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16] -; CHECK-GI-DOT-NEXT: ldr b6, [x0, #17] -; CHECK-GI-DOT-NEXT: ldr b4, [x1] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #1] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16] -; CHECK-GI-DOT-NEXT: ldr b18, [x1, #17] -; CHECK-GI-DOT-NEXT: mov v2.b[1], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #32] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #33] -; CHECK-GI-DOT-NEXT: mov v4.b[1], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #32] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #33] -; CHECK-GI-DOT-NEXT: mov v5.b[1], v18.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #2] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #18] -; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #2] -; CHECK-GI-DOT-NEXT: mov v6.b[1], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #34] -; CHECK-GI-DOT-NEXT: mov v2.b[2], v18.b[0] -; CHECK-GI-DOT-NEXT: ldr b18, [x1, #34] -; CHECK-GI-DOT-NEXT: mov v4.b[2], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #3] -; CHECK-GI-DOT-NEXT: mov v5.b[2], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #19] -; CHECK-GI-DOT-NEXT: mov v3.b[2], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #19] -; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #3] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #35] -; CHECK-GI-DOT-NEXT: mov v2.b[3], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #35] -; CHECK-GI-DOT-NEXT: mov v4.b[3], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4] -; CHECK-GI-DOT-NEXT: mov v5.b[3], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #20] -; CHECK-GI-DOT-NEXT: mov v3.b[3], v18.b[0] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #36] -; CHECK-GI-DOT-NEXT: mov v6.b[3], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #4] -; CHECK-GI-DOT-NEXT: ldr b16, 
[x1, #20] -; CHECK-GI-DOT-NEXT: mov v2.b[4], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #36] -; CHECK-GI-DOT-NEXT: mov v4.b[4], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #5] -; CHECK-GI-DOT-NEXT: mov v5.b[4], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #21] -; CHECK-GI-DOT-NEXT: mov v3.b[4], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[4], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #5] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #21] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #37] -; CHECK-GI-DOT-NEXT: mov v2.b[5], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #37] -; CHECK-GI-DOT-NEXT: mov v4.b[5], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6] -; CHECK-GI-DOT-NEXT: mov v5.b[5], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #22] -; CHECK-GI-DOT-NEXT: mov v3.b[5], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[5], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #6] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #38] -; CHECK-GI-DOT-NEXT: mov v2.b[6], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #38] -; CHECK-GI-DOT-NEXT: mov v4.b[6], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #7] -; CHECK-GI-DOT-NEXT: mov v5.b[6], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #23] -; CHECK-GI-DOT-NEXT: mov v3.b[6], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[6], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[7], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #7] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #23] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #39] -; CHECK-GI-DOT-NEXT: mov v2.b[7], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #39] -; CHECK-GI-DOT-NEXT: mov v4.b[7], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #8] -; CHECK-GI-DOT-NEXT: mov v5.b[7], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #24] -; CHECK-GI-DOT-NEXT: mov v3.b[7], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[7], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[8], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #8] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #24] -; 
CHECK-GI-DOT-NEXT: ldr b18, [x0, #40] -; CHECK-GI-DOT-NEXT: mov v2.b[8], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #40] -; CHECK-GI-DOT-NEXT: mov v4.b[8], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9] -; CHECK-GI-DOT-NEXT: mov v5.b[8], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #25] -; CHECK-GI-DOT-NEXT: mov v3.b[8], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[8], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[9], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #9] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #25] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #41] -; CHECK-GI-DOT-NEXT: mov v2.b[9], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #41] -; CHECK-GI-DOT-NEXT: mov v4.b[9], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #10] -; CHECK-GI-DOT-NEXT: mov v5.b[9], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #26] -; CHECK-GI-DOT-NEXT: mov v3.b[9], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[9], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[10], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #10] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #26] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #42] -; CHECK-GI-DOT-NEXT: mov v2.b[10], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #42] -; CHECK-GI-DOT-NEXT: mov v4.b[10], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11] -; CHECK-GI-DOT-NEXT: mov v5.b[10], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #27] -; CHECK-GI-DOT-NEXT: mov v3.b[10], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[10], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[11], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #11] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #27] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #43] -; CHECK-GI-DOT-NEXT: mov v2.b[11], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #43] -; CHECK-GI-DOT-NEXT: mov v4.b[11], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #12] -; CHECK-GI-DOT-NEXT: mov v5.b[11], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #28] -; CHECK-GI-DOT-NEXT: mov v3.b[11], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[11], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[12], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #12] -; 
CHECK-GI-DOT-NEXT: ldr b16, [x1, #28] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #44] -; CHECK-GI-DOT-NEXT: mov v2.b[12], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #44] -; CHECK-GI-DOT-NEXT: mov v4.b[12], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13] -; CHECK-GI-DOT-NEXT: mov v5.b[12], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #29] -; CHECK-GI-DOT-NEXT: mov v3.b[12], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[12], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[13], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #13] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #29] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #45] -; CHECK-GI-DOT-NEXT: mov v2.b[13], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #45] -; CHECK-GI-DOT-NEXT: mov v4.b[13], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #14] -; CHECK-GI-DOT-NEXT: mov v5.b[13], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #30] -; CHECK-GI-DOT-NEXT: mov v3.b[13], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[13], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[14], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #14] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #30] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #46] -; CHECK-GI-DOT-NEXT: mov v2.b[14], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #46] -; CHECK-GI-DOT-NEXT: mov v4.b[14], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15] -; CHECK-GI-DOT-NEXT: mov v5.b[14], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #31] -; CHECK-GI-DOT-NEXT: mov v3.b[14], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[14], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[15], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #15] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #31] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #47] -; CHECK-GI-DOT-NEXT: mov v2.b[15], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #47] -; CHECK-GI-DOT-NEXT: mov v4.b[15], v7.b[0] -; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v5.b[15], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v3.b[15], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[15], v16.b[0] -; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000 -; 
CHECK-GI-DOT-NEXT: udot v0.4s, v4.16b, v1.16b -; CHECK-GI-DOT-NEXT: udot v7.4s, v5.16b, v2.16b -; CHECK-GI-DOT-NEXT: udot v16.4s, v6.16b, v3.16b +; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] +; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0] +; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1] +; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32] +; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b +; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b +; CHECK-GI-DOT-NEXT: udot v2.4s, v16.16b, v7.16b ; CHECK-GI-DOT-NEXT: addv s0, v0.4s -; CHECK-GI-DOT-NEXT: addv s1, v7.4s -; CHECK-GI-DOT-NEXT: addv s2, v16.4s +; CHECK-GI-DOT-NEXT: addv s1, v1.4s +; CHECK-GI-DOT-NEXT: addv s2, v2.4s ; CHECK-GI-DOT-NEXT: fmov w8, s0 ; CHECK-GI-DOT-NEXT: fmov w9, s1 -; CHECK-GI-DOT-NEXT: fmov w10, s2 ; CHECK-GI-DOT-NEXT: add w8, w8, w9 -; CHECK-GI-DOT-NEXT: add w0, w8, w10 +; CHECK-GI-DOT-NEXT: fmov w9, s2 +; CHECK-GI-DOT-NEXT: add w0, w8, w9 ; CHECK-GI-DOT-NEXT: ret entry: %a = load <48 x i8>, ptr %p1 @@ -2648,126 +2415,50 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldr q0, [x0] ; CHECK-GI-BASE-NEXT: ldr q1, [x1] -; CHECK-GI-BASE-NEXT: ldr d4, [x0, #16] -; CHECK-GI-BASE-NEXT: ldr d5, [x1, #16] -; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16] +; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16] +; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: smull v6.4s, v3.4h, v2.4h -; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: sshll v3.8h, v4.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v4.8h, v5.8b, #0 -; CHECK-GI-BASE-NEXT: smlal2 v2.4s, v4.8h, v3.8h -; CHECK-GI-BASE-NEXT: smlal v6.4s, v4.4h, 
v3.4h -; CHECK-GI-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h -; CHECK-GI-BASE-NEXT: smlal v6.4s, v1.4h, v0.4h -; CHECK-GI-BASE-NEXT: add v0.4s, v6.4s, v2.4s +; CHECK-GI-BASE-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: smull v6.4s, v5.4h, v4.4h +; CHECK-GI-BASE-NEXT: smull2 v4.4s, v5.8h, v4.8h +; CHECK-GI-BASE-NEXT: smull2 v5.4s, v1.8h, v0.8h +; CHECK-GI-BASE-NEXT: smull v7.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-GI-BASE-NEXT: smull2 v1.4s, v3.8h, v2.8h +; CHECK-GI-BASE-NEXT: addv s2, v6.4s +; CHECK-GI-BASE-NEXT: addv s3, v4.4s +; CHECK-GI-BASE-NEXT: addv s4, v5.4s +; CHECK-GI-BASE-NEXT: addv s5, v7.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: fmov w0, s0 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: fmov w8, s2 +; CHECK-GI-BASE-NEXT: fmov w9, s3 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: fmov w9, s0 +; CHECK-GI-BASE-NEXT: add w10, w10, w11 +; CHECK-GI-BASE-NEXT: fmov w11, s1 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: add w9, w10, w11 +; CHECK-GI-BASE-NEXT: add w0, w8, w9 ; CHECK-GI-BASE-NEXT: ret ; ; CHECK-GI-DOT-LABEL: test_sdot_v24i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: ldr b1, [x0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #1] ; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: ldr b2, [x1] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #1] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #8] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #2] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8] -; CHECK-GI-DOT-NEXT: mov v2.b[1], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #2] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #17] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #17] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #3] -; CHECK-GI-DOT-NEXT: mov v2.b[2], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #3] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] -; 
CHECK-GI-DOT-NEXT: ldr b3, [x0, #4] -; CHECK-GI-DOT-NEXT: mov v2.b[3], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #4] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #5] -; CHECK-GI-DOT-NEXT: mov v2.b[4], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #5] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #6] -; CHECK-GI-DOT-NEXT: mov v2.b[5], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #6] -; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #7] -; CHECK-GI-DOT-NEXT: mov v2.b[6], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #7] -; CHECK-GI-DOT-NEXT: mov v1.b[7], v3.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #16] -; CHECK-GI-DOT-NEXT: mov v2.b[7], v4.b[0] -; CHECK-GI-DOT-NEXT: ldr b4, [x1, #16] -; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #18] -; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18] -; CHECK-GI-DOT-NEXT: mov v1.b[8], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #9] -; CHECK-GI-DOT-NEXT: mov v2.b[8], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #9] -; CHECK-GI-DOT-NEXT: mov v3.b[2], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #19] -; CHECK-GI-DOT-NEXT: mov v4.b[2], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19] -; CHECK-GI-DOT-NEXT: mov v1.b[9], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #10] -; CHECK-GI-DOT-NEXT: mov v2.b[9], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #10] -; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #20] -; CHECK-GI-DOT-NEXT: mov v4.b[3], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20] -; CHECK-GI-DOT-NEXT: mov v1.b[10], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #11] -; CHECK-GI-DOT-NEXT: mov v2.b[10], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #11] -; CHECK-GI-DOT-NEXT: mov v3.b[4], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #21] -; CHECK-GI-DOT-NEXT: mov v4.b[4], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21] -; CHECK-GI-DOT-NEXT: mov v1.b[11], v5.b[0] -; 
CHECK-GI-DOT-NEXT: ldr b5, [x0, #12] -; CHECK-GI-DOT-NEXT: mov v2.b[11], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #12] -; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #22] -; CHECK-GI-DOT-NEXT: mov v4.b[5], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22] -; CHECK-GI-DOT-NEXT: mov v1.b[12], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #13] -; CHECK-GI-DOT-NEXT: mov v2.b[12], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #13] -; CHECK-GI-DOT-NEXT: mov v3.b[6], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #23] -; CHECK-GI-DOT-NEXT: mov v4.b[6], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23] -; CHECK-GI-DOT-NEXT: mov v1.b[13], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #14] -; CHECK-GI-DOT-NEXT: mov v2.b[13], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #14] -; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0] -; CHECK-GI-DOT-NEXT: mov v4.b[7], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[14], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15] -; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15] -; CHECK-GI-DOT-NEXT: fmov d3, d3 -; CHECK-GI-DOT-NEXT: fmov d4, d4 -; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0] -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0] -; CHECK-GI-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b -; CHECK-GI-DOT-NEXT: sdot v5.4s, v2.16b, v1.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: ldr q2, [x0] +; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16] +; CHECK-GI-DOT-NEXT: ldr q4, [x1] +; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16] +; CHECK-GI-DOT-NEXT: sdot v1.4s, v4.16b, v2.16b +; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret @@ -2835,243 +2526,91 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-BASE-LABEL: test_sdot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry -; 
CHECK-GI-BASE-NEXT: ldp q0, q4, [x1] -; CHECK-GI-BASE-NEXT: ldr q2, [x0, #32] -; CHECK-GI-BASE-NEXT: ldp q1, q3, [x0] -; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] -; CHECK-GI-BASE-NEXT: sshll2 v16.8h, v2.16b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v17.8h, v7.16b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v5.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-GI-BASE-NEXT: smull2 v18.4s, v6.8h, v5.8h -; CHECK-GI-BASE-NEXT: smull v19.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: smull v5.4s, v6.4h, v5.4h -; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: sshll v1.8h, v2.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v2.8h, v7.8b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v3.16b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v4.16b, #0 -; CHECK-GI-BASE-NEXT: smlal2 v18.4s, v17.8h, v16.8h -; CHECK-GI-BASE-NEXT: smlal v5.4s, v17.4h, v16.4h -; CHECK-GI-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h -; CHECK-GI-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h -; CHECK-GI-BASE-NEXT: sshll v1.8h, v3.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v2.8h, v4.8b, #0 -; CHECK-GI-BASE-NEXT: smlal2 v18.4s, v7.8h, v6.8h -; CHECK-GI-BASE-NEXT: smlal v5.4s, v7.4h, v6.4h -; CHECK-GI-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h -; CHECK-GI-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h -; CHECK-GI-BASE-NEXT: add v1.4s, v19.4s, v5.4s -; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] +; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] +; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-BASE-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-BASE-NEXT: sshll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-GI-BASE-NEXT: sshll2 
v2.8h, v2.16b, #0 +; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h +; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h +; CHECK-GI-BASE-NEXT: smull2 v19.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v16.4h +; CHECK-GI-BASE-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: sshll v5.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v1.8h, v17.8b, #0 +; CHECK-GI-BASE-NEXT: smull2 v7.4s, v7.8h, v16.8h +; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v17.8h, v17.16b, #0 +; CHECK-GI-BASE-NEXT: addv s16, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: smull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h +; CHECK-GI-BASE-NEXT: addv s3, v19.4s +; CHECK-GI-BASE-NEXT: smull v19.4s, v5.4h, v1.4h +; CHECK-GI-BASE-NEXT: smull2 v1.4s, v5.8h, v1.8h +; CHECK-GI-BASE-NEXT: addv s5, v20.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: fmov w0, s0 +; CHECK-GI-BASE-NEXT: addv s7, v7.4s +; CHECK-GI-BASE-NEXT: smull v20.4s, v6.4h, v17.4h +; CHECK-GI-BASE-NEXT: smull2 v6.4s, v6.8h, v17.8h +; CHECK-GI-BASE-NEXT: fmov w8, s16 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s3 +; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s2, v2.4s +; CHECK-GI-BASE-NEXT: fmov w11, s5 +; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: fmov w9, s0 +; CHECK-GI-BASE-NEXT: addv s0, v1.4s +; CHECK-GI-BASE-NEXT: addv s1, v20.4s +; CHECK-GI-BASE-NEXT: addv s5, v6.4s +; CHECK-GI-BASE-NEXT: add w10, w10, w11 +; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: fmov w12, s2 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: fmov w9, s7 +; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: add w10, w11, w12 +; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: add w9, w10, w11 +; CHECK-GI-BASE-NEXT: fmov w10, s0 +; CHECK-GI-BASE-NEXT: fmov w11, s5 +; 
CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: add w8, w8, w9 +; CHECK-GI-BASE-NEXT: add w9, w10, w11 +; CHECK-GI-BASE-NEXT: add w0, w8, w9 ; CHECK-GI-BASE-NEXT: ret ; ; CHECK-GI-DOT-LABEL: test_sdot_v48i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: ldr b1, [x0] -; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1] ; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16] -; CHECK-GI-DOT-NEXT: ldr b6, [x0, #17] -; CHECK-GI-DOT-NEXT: ldr b4, [x1] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #1] -; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0] -; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16] -; CHECK-GI-DOT-NEXT: ldr b18, [x1, #17] -; CHECK-GI-DOT-NEXT: mov v2.b[1], v6.b[0] -; CHECK-GI-DOT-NEXT: ldr b3, [x0, #32] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #33] -; CHECK-GI-DOT-NEXT: mov v4.b[1], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b6, [x1, #32] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #33] -; CHECK-GI-DOT-NEXT: mov v5.b[1], v18.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #2] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #18] -; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #2] -; CHECK-GI-DOT-NEXT: mov v6.b[1], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[2], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #34] -; CHECK-GI-DOT-NEXT: mov v2.b[2], v18.b[0] -; CHECK-GI-DOT-NEXT: ldr b18, [x1, #34] -; CHECK-GI-DOT-NEXT: mov v4.b[2], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #3] -; CHECK-GI-DOT-NEXT: mov v5.b[2], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #19] -; CHECK-GI-DOT-NEXT: mov v3.b[2], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #19] -; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[3], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #3] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #35] -; CHECK-GI-DOT-NEXT: mov v2.b[3], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #35] -; CHECK-GI-DOT-NEXT: mov v4.b[3], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4] -; 
CHECK-GI-DOT-NEXT: mov v5.b[3], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #20] -; CHECK-GI-DOT-NEXT: mov v3.b[3], v18.b[0] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #36] -; CHECK-GI-DOT-NEXT: mov v6.b[3], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #4] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20] -; CHECK-GI-DOT-NEXT: mov v2.b[4], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #36] -; CHECK-GI-DOT-NEXT: mov v4.b[4], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #5] -; CHECK-GI-DOT-NEXT: mov v5.b[4], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #21] -; CHECK-GI-DOT-NEXT: mov v3.b[4], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[4], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[5], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #5] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #21] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #37] -; CHECK-GI-DOT-NEXT: mov v2.b[5], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #37] -; CHECK-GI-DOT-NEXT: mov v4.b[5], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6] -; CHECK-GI-DOT-NEXT: mov v5.b[5], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #22] -; CHECK-GI-DOT-NEXT: mov v3.b[5], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[5], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #6] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #38] -; CHECK-GI-DOT-NEXT: mov v2.b[6], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #38] -; CHECK-GI-DOT-NEXT: mov v4.b[6], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #7] -; CHECK-GI-DOT-NEXT: mov v5.b[6], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #23] -; CHECK-GI-DOT-NEXT: mov v3.b[6], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[6], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[7], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #7] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #23] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #39] -; CHECK-GI-DOT-NEXT: mov v2.b[7], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #39] -; CHECK-GI-DOT-NEXT: mov v4.b[7], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr 
b7, [x0, #8] -; CHECK-GI-DOT-NEXT: mov v5.b[7], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #24] -; CHECK-GI-DOT-NEXT: mov v3.b[7], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[7], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[8], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #8] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #24] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #40] -; CHECK-GI-DOT-NEXT: mov v2.b[8], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #40] -; CHECK-GI-DOT-NEXT: mov v4.b[8], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9] -; CHECK-GI-DOT-NEXT: mov v5.b[8], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #25] -; CHECK-GI-DOT-NEXT: mov v3.b[8], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[8], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[9], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #9] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #25] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #41] -; CHECK-GI-DOT-NEXT: mov v2.b[9], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #41] -; CHECK-GI-DOT-NEXT: mov v4.b[9], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #10] -; CHECK-GI-DOT-NEXT: mov v5.b[9], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #26] -; CHECK-GI-DOT-NEXT: mov v3.b[9], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[9], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[10], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #10] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #26] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #42] -; CHECK-GI-DOT-NEXT: mov v2.b[10], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #42] -; CHECK-GI-DOT-NEXT: mov v4.b[10], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11] -; CHECK-GI-DOT-NEXT: mov v5.b[10], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #27] -; CHECK-GI-DOT-NEXT: mov v3.b[10], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[10], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[11], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #11] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #27] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #43] -; CHECK-GI-DOT-NEXT: mov v2.b[11], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #43] -; CHECK-GI-DOT-NEXT: mov v4.b[11], 
v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #12] -; CHECK-GI-DOT-NEXT: mov v5.b[11], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #28] -; CHECK-GI-DOT-NEXT: mov v3.b[11], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[11], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[12], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #12] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #28] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #44] -; CHECK-GI-DOT-NEXT: mov v2.b[12], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #44] -; CHECK-GI-DOT-NEXT: mov v4.b[12], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13] -; CHECK-GI-DOT-NEXT: mov v5.b[12], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #29] -; CHECK-GI-DOT-NEXT: mov v3.b[12], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[12], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[13], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #13] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #29] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #45] -; CHECK-GI-DOT-NEXT: mov v2.b[13], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #45] -; CHECK-GI-DOT-NEXT: mov v4.b[13], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #14] -; CHECK-GI-DOT-NEXT: mov v5.b[13], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x0, #30] -; CHECK-GI-DOT-NEXT: mov v3.b[13], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[13], v16.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[14], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #14] -; CHECK-GI-DOT-NEXT: ldr b16, [x1, #30] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #46] -; CHECK-GI-DOT-NEXT: mov v2.b[14], v17.b[0] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #46] -; CHECK-GI-DOT-NEXT: mov v4.b[14], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15] -; CHECK-GI-DOT-NEXT: mov v5.b[14], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr b16, [x0, #31] -; CHECK-GI-DOT-NEXT: mov v3.b[14], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[14], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v1.b[15], v7.b[0] -; CHECK-GI-DOT-NEXT: ldr b7, [x1, #15] -; CHECK-GI-DOT-NEXT: ldr b17, [x1, #31] -; CHECK-GI-DOT-NEXT: ldr b18, [x0, #47] -; CHECK-GI-DOT-NEXT: mov v2.b[15], v16.b[0] -; CHECK-GI-DOT-NEXT: ldr 
b16, [x1, #47] -; CHECK-GI-DOT-NEXT: mov v4.b[15], v7.b[0] -; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: mov v5.b[15], v17.b[0] -; CHECK-GI-DOT-NEXT: mov v3.b[15], v18.b[0] -; CHECK-GI-DOT-NEXT: mov v6.b[15], v16.b[0] -; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: sdot v0.4s, v4.16b, v1.16b -; CHECK-GI-DOT-NEXT: sdot v7.4s, v5.16b, v2.16b -; CHECK-GI-DOT-NEXT: sdot v16.4s, v6.16b, v3.16b +; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] +; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0] +; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1] +; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32] +; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b +; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b +; CHECK-GI-DOT-NEXT: sdot v2.4s, v16.16b, v7.16b ; CHECK-GI-DOT-NEXT: addv s0, v0.4s -; CHECK-GI-DOT-NEXT: addv s1, v7.4s -; CHECK-GI-DOT-NEXT: addv s2, v16.4s +; CHECK-GI-DOT-NEXT: addv s1, v1.4s +; CHECK-GI-DOT-NEXT: addv s2, v2.4s ; CHECK-GI-DOT-NEXT: fmov w8, s0 ; CHECK-GI-DOT-NEXT: fmov w9, s1 -; CHECK-GI-DOT-NEXT: fmov w10, s2 ; CHECK-GI-DOT-NEXT: add w8, w8, w9 -; CHECK-GI-DOT-NEXT: add w0, w8, w10 +; CHECK-GI-DOT-NEXT: fmov w9, s2 +; CHECK-GI-DOT-NEXT: add w0, w8, w9 ; CHECK-GI-DOT-NEXT: ret entry: %a = load <48 x i8>, ptr %p1 diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 54b29be2132c..716d2398996b 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -305,15 +305,14 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) { ; ; CHECK-GI-LABEL: zext_v3i8_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: fmov d1, x0 -; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-GI-NEXT: movi v0.2d, #0x000000000000ff +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff ; CHECK-GI-NEXT: // kill: def $w2 killed 
$w2 def $x2 ; CHECK-GI-NEXT: and x8, x2, #0xff ; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v1.d[1], x1 -; CHECK-GI-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret @@ -470,15 +469,14 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v3i10_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-GI-NEXT: fmov d0, x0 -; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-GI-NEXT: fmov s0, w0 ; CHECK-GI-NEXT: adrp x8, .LCPI27_0 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_0] ; CHECK-GI-NEXT: and x8, x2, #0x3ff ; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.d[1], x1 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 |