author     Alexey Bataev <a.bataev@outlook.com>    2024-05-01 15:52:23 -0400
committer  GitHub <noreply@github.com>             2024-05-01 15:52:23 -0400
commit     59ef94d7cf3cb4f5aac514a72d00d1f0fa4a9fb3 (patch)
tree       0642ec35b16c2e738e1fcbffe6d5bbd2b896152a
parent     e846778e52f8586b5092c2fd4cdbec2334e31770 (diff)
[SLP]Do not include the cost of and -1, <v> and emit just <v> after MinBitWidth.
After minbitwidth analysis, "and <v>, (power_of_2 - 1 const)" can be
transformed into "and <v>, (all_ones const)", which is a no-op that can be
ignored both in cost estimation and in codegen, emitting just <v>. The x264
benchmark contains this pattern.
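
For illustration, a minimal IR sketch of the pattern (hypothetical values,
assuming a minimized width of 16 bits; not taken verbatim from the patch):

  ; Before minbitwidth: the mask is a power-of-2-minus-1 constant.
  %m = and i64 %v, 65535             ; 65535 == 2^16 - 1

  ; After narrowing to i16, the same mask becomes all-ones, so the 'and'
  ; is a no-op: costing ignores it and codegen emits just the truncated %v.
  %t = trunc i64 %v to i16
  %m.16 = and i16 %t, -1             ; folds away; only %t survives

This mirrors the and/xor sequence eliminated in the RISCV tests below.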
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/90739
3 files changed, 26 insertions, 4 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c33d90d531bf..038bceb39cb8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9484,6 +9484,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                               Op1Info, Op2Info, Operands, VI);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
+      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+          ArrayRef<Value *> Ops = E->getOperand(I);
+          if (all_of(Ops, [&](Value *Op) {
+                auto *CI = dyn_cast<ConstantInt>(Op);
+                return CI && CI->getValue().countr_one() >= It->second.first;
+              }))
+            return CommonCost;
+        }
+      }
       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
@@ -12969,6 +12979,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
+      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+          ArrayRef<Value *> Ops = E->getOperand(I);
+          if (all_of(Ops, [&](Value *Op) {
+                auto *CI = dyn_cast<ConstantInt>(Op);
+                return CI && CI->getValue().countr_one() >= It->second.first;
+              })) {
+            V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
+            E->VectorizedValue = V;
+            ++NumVectorInstructions;
+            return V;
+          }
+        }
+      }
       if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
         assert((It != MinBWs.end() ||
                 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
index fc977585614b..d6dc3bcc3354 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
@@ -12,8 +12,7 @@ define i16 @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
 ; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[TMP5]] to i16
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
index 04d275742832..0c0c723e6699 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
@@ -9,8 +9,7 @@ define i32 @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP5]], i32 1)