diff options
author | Alexey Bataev <a.bataev@outlook.com> | 2024-04-03 15:58:58 -0400 |
---|---|---|
committer | Alexey Bataev <a.bataev@outlook.com> | 2024-04-03 14:18:45 -0700 |
commit | 42cbceb0f0160d67145723613fda325dbd129308 (patch) | |
tree | d92eec8e4ec3f510864f6c82b1bc49ba508df139 | |
parent | 3ee93f486293420852fb9ec95af9c5f54cecdb08 (diff) |
[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp instructions.
The compiler can improve the minimum-bitwidth analysis for the operands of
UIToFP/SIToFP instructions and the operands of ICmp instructions.
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/85966
3 files changed, 50 insertions, 16 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9b87e6e11e06..99769540f780 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1107,7 +1107,7 @@ public: MinBWs.clear(); ReductionBitWidth = 0; CastMaxMinBWSizes.reset(); - TruncNodes.clear(); + ExtraBitWidthNodes.clear(); InstrElementSize.clear(); UserIgnoreList = nullptr; PostponedGathers.clear(); @@ -3683,8 +3683,9 @@ private: /// type sizes, used in the tree. std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes; - /// Indices of the vectorized trunc nodes. - DenseSet<unsigned> TruncNodes; + /// Indices of the vectorized nodes, which supposed to be the roots of the new + /// bitwidth analysis attempt, like trunc, IToFP or ICmp. + DenseSet<unsigned> ExtraBitWidthNodes; }; } // end namespace slpvectorizer @@ -6612,7 +6613,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, PrevMaxBW), std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMinBW)); - TruncNodes.insert(VectorizableTree.size()); + ExtraBitWidthNodes.insert(VectorizableTree.size() + 1); + } else if (ShuffleOrOp == Instruction::SIToFP || + ShuffleOrOp == Instruction::UIToFP) { + unsigned NumSignBits = + ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT); + if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) { + APInt Mask = DB->getDemandedBits(OpI); + NumSignBits = std::max(NumSignBits, Mask.countl_zero()); + } + if (NumSignBits * 2 >= + DL->getTypeSizeInBits(VL0->getOperand(0)->getType())) + ExtraBitWidthNodes.insert(VectorizableTree.size() + 1); } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -6660,6 +6672,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); + if (ShuffleOrOp == 
Instruction::ICmp) { + unsigned NumSignBits0 = + ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT); + if (NumSignBits0 * 2 >= + DL->getTypeSizeInBits(VL0->getOperand(0)->getType())) + ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx); + unsigned NumSignBits1 = + ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT); + if (NumSignBits1 * 2 >= + DL->getTypeSizeInBits(VL0->getOperand(1)->getType())) + ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx); + } return; } case Instruction::Select: @@ -14302,7 +14326,8 @@ void BoUpSLP::computeMinimumValueSizes() { bool IsStoreOrInsertElt = VectorizableTree.front()->getOpcode() == Instruction::Store || VectorizableTree.front()->getOpcode() == Instruction::InsertElement; - if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 && + if ((IsStoreOrInsertElt || UserIgnoreList) && + ExtraBitWidthNodes.size() <= 1 && (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 || CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2)) return; @@ -14506,16 +14531,23 @@ void BoUpSLP::computeMinimumValueSizes() { IsTopRoot = false; IsProfitableToDemoteRoot = true; - if (TruncNodes.empty()) { + if (ExtraBitWidthNodes.empty()) { NodeIdx = VectorizableTree.size(); } else { unsigned NewIdx = 0; do { - NewIdx = *TruncNodes.begin() + 1; - TruncNodes.erase(TruncNodes.begin()); - } while (NewIdx <= NodeIdx && !TruncNodes.empty()); + NewIdx = *ExtraBitWidthNodes.begin(); + ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin()); + } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty()); NodeIdx = NewIdx; - IsTruncRoot = true; + IsTruncRoot = + NodeIdx < VectorizableTree.size() && + any_of(VectorizableTree[NodeIdx]->UserTreeIndices, + [](const EdgeInfo &EI) { + return EI.EdgeIdx == 0 && + EI.UserTE->getOpcode() == Instruction::Trunc && + !EI.UserTE->isAltShuffle(); + }); } // If the maximum bit width we compute is less than the with of the roots' diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll index fc28d7ab4ee7..e1fd8a7ec88a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll @@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24> ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]] ; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8> -; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254> +; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254> ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4> ; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]] ; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll index 136ab6400773..668d3c3c8c82 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll @@ -10,12 +10,14 @@ define void @test() { ; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1> -; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> 
[[TMP5]] to <4 x i1> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7> ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]]) |