diff options
author | Alexey Bataev <a.bataev@outlook.com> | 2024-05-01 15:53:25 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-01 15:53:25 -0400 |
commit | fc382db239abea827e7f2ad7a0e18eee17ff5709 (patch) | |
tree | ef6265d9e432771ab128c8fcfbdfe77a77bb841e | |
parent | 59ef94d7cf3cb4f5aac514a72d00d1f0fa4a9fb3 (diff) |
[SLP]Improve comparison of shuffled loads/masked gathers by adding GEP cost.
In some cases a masked gather is less profitable than inserting subvectors of
consecutive/strided loads. SLP already has this kind of analysis, but it needs
to be improved by including the GEP cost in the comparison.
Also, the GEP cost estimation for masked gather is fixed.
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/90737
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 76 | ||||
-rw-r--r-- | llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll | 15 |
2 files changed, 62 insertions, 29 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 038bceb39cb8..bc553c5009ed 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4326,6 +4326,11 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, return Expander.expandCodeFor(Stride, Stride->getType(), Inst); } +static std::pair<InstructionCost, InstructionCost> +getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs, + Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, + Type *ScalarTy, VectorType *VecTy); + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const { @@ -4464,31 +4469,56 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( if (VectorizedCnt == VL.size() / VF) { // Compare masked gather cost and loads + insersubvector costs. 
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost( - Instruction::Load, VecTy, - cast<LoadInst>(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind); + auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( + TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr, + CostKind, ScalarTy, VecTy); + InstructionCost MaskedGatherCost = + TTI.getGatherScatterOpCost( + Instruction::Load, VecTy, + cast<LoadInst>(VL0)->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind) + + VectorGEPCost - ScalarGEPCost; InstructionCost VecLdCost = 0; auto *SubVecTy = FixedVectorType::get(ScalarTy, VF); for (auto [I, LS] : enumerate(States)) { auto *LI0 = cast<LoadInst>(VL[I * VF]); switch (LS) { - case LoadsState::Vectorize: + case LoadsState::Vectorize: { + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), Instruction::Load, + CostKind, ScalarTy, SubVecTy); VecLdCost += TTI.getMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getAlign(), - LI0->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo()); + Instruction::Load, SubVecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo()) + + VectorGEPCost - ScalarGEPCost; break; - case LoadsState::StridedVectorize: - VecLdCost += TTI.getStridedMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind); + } + case LoadsState::StridedVectorize: { + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), Instruction::Load, + CostKind, ScalarTy, SubVecTy); + VecLdCost += + TTI.getStridedMemoryOpCost( + Instruction::Load, SubVecTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind) + + VectorGEPCost - ScalarGEPCost; break; - case 
LoadsState::ScatterVectorize: - VecLdCost += TTI.getGatherScatterOpCost( - Instruction::Load, SubVecTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind); + } + case LoadsState::ScatterVectorize: { + auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( + TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), Instruction::GetElementPtr, + CostKind, ScalarTy, SubVecTy); + VecLdCost += + TTI.getGatherScatterOpCost( + Instruction::Load, SubVecTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind) + + VectorGEPCost - ScalarGEPCost; break; + } case LoadsState::Gather: llvm_unreachable( "Expected only consecutive, strided or masked gather loads."); @@ -4497,13 +4527,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( for (int Idx : seq<int>(0, VL.size())) ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; VecLdCost += - TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy, - ShuffleMask, CostKind, I * VF, SubVecTy); + TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask, + CostKind, I * VF, SubVecTy); } // If masked gather cost is higher - better to vectorize, so // consider it as a gather node. It will be better estimated // later. 
- if (MaskedGatherCost > VecLdCost) + if (MaskedGatherCost >= VecLdCost) return true; } } @@ -7951,7 +7981,13 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs, ScalarCost = TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind); - if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) { + auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr); + if (!BaseGEP) { + auto *It = find_if(Ptrs, IsaPred<GEPOperator>); + if (It != Ptrs.end()) + BaseGEP = cast<GEPOperator>(*It); + } + if (BaseGEP) { SmallVector<const Value *> Indices(BaseGEP->indices()); VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(), BaseGEP->getPointerOperand(), Indices, VecTy, diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index 05c3151fca54..94a55c435c8c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -4,16 +4,13 @@ define void @test(ptr noalias %p, ptr %p1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[GEP799:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 -; CHECK-NEXT: [[L3:%.*]] = load i16, ptr [[GEP799]], align 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 18 -; CHECK-NEXT: [[L4:%.*]] = load i16, ptr [[GEP3]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 -; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[P1]], align 2 -; CHECK-NEXT: [[GEPS2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 -; CHECK-NEXT: store i16 [[L3]], ptr [[GEPS2]], align 2 -; CHECK-NEXT: [[GEPS3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 6 -; CHECK-NEXT: store i16 [[L4]], ptr [[GEPS3]], align 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 +; 
CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; CHECK-NEXT: ret void ; %l1 = load i16, ptr %p, align 2 |