author     Alexey Bataev <a.bataev@outlook.com>   2024-04-03 14:28:36 -0400
committer  GitHub <noreply@github.com>            2024-04-03 14:28:36 -0400
commit     d57884011e8c57b118b831614b692ba4bc8b5aca (patch)
tree       001c00aa28bde941580b104820e0138df55c754f
parent     5bbce06ac642bedcb93158ed04253cf6deedf5e6 (diff)
[SLP] Add support for commutative intrinsics.

Implemented the long-standing TODO to support commutative intrinsics.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/86316
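For illustration, a minimal hypothetical example (not taken from this commit's tests; the function name is invented, and whether SLP actually vectorizes it depends on the target's cost model): the two smin calls below use the same operands in swapped order, so with this change the vectorizer may reorder the lane operands and emit a single <2 x i32> @llvm.smin.v2i32 call when run through opt -passes=slp-vectorizer.

    define void @commuted_smin(ptr %p, i32 %a, i32 %b) {
      ; Lane 0 and lane 1 are the same smin with commuted operands.
      %s0 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
      %s1 = call i32 @llvm.smin.i32(i32 %b, i32 %a)
      store i32 %s0, ptr %p
      %q = getelementptr i32, ptr %p, i64 1
      store i32 %s1, ptr %q
      ret void
    }
    declare i32 @llvm.smin.i32(i32, i32)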
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp                      | 44
 llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll | 23
 llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll          |  2
 llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll  |  6
 llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll     |  4
 llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll     |  2
 6 files changed, 54 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9f8bc5527a6a..9b87e6e11e06 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -306,10 +306,7 @@ static bool isCommutative(Instruction *I) {
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
return BO->isCommutative();
- // TODO: This should check for generic Instruction::isCommutative(), but
- // we need to confirm that the caller code correctly handles Intrinsics
- // for example (does not have 2 operands).
- return false;
+ return I->isCommutative();
}
/// \returns inserting index of InsertElement or InsertValue instruction,
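Note: Instruction::isCommutative() defers to IntrinsicInst::isCommutative() for calls, which returns true only for intrinsics whose first two operands may be swapped, such as smin/smax/umin/umax, minnum/maxnum, the saturating and overflowing adds and multiplies, fma, and fmuladd. A small IR sketch of what that guarantees (function name invented):

    define float @commuted(float %x, float %y, float %z) {
      ; The multiplicands of fmuladd commute, so this computes the same
      ; value as fmuladd(%x, %y, %z); the addend %z does not take part
      ; in the commutation.
      %r = call float @llvm.fmuladd.f32(float %y, float %x, float %z)
      ret float %r
    }
    declare float @llvm.fmuladd.f32(float, float, float)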
@@ -1975,6 +1972,9 @@ public:
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+ constexpr unsigned IntrinsicNumOperands = 2;
+ if (isa<IntrinsicInst>(VL[0]))
+ NumOperands = IntrinsicNumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
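Note: the clamp to two operands is needed because a CallInst's operand list contains every argument plus the called function itself, while commutativity only ever applies to the first two arguments. A hypothetical view of what the vectorizer sees (function name invented):

    define float @operand_view(float %x, float %y, float %z) {
      ; The call's operands are (%x, %y, %z, @llvm.fmuladd.f32): three
      ; arguments followed by the callee. Only %x and %y participate in
      ; commutative reordering, hence NumOperands is clamped to 2 above.
      %r = call float @llvm.fmuladd.f32(float %x, float %y, float %z)
      ret float %r
    }
    declare float @llvm.fmuladd.f32(float, float, float)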
@@ -3420,10 +3420,11 @@ private:
// immediates do not affect scheduler behavior this is considered
// okay.
auto *In = BundleMember->Inst;
- assert(In &&
- (isa<ExtractValueInst, ExtractElementInst>(In) ||
- In->getNumOperands() == TE->getNumOperands()) &&
- "Missed TreeEntry operands?");
+ assert(
+ In &&
+ (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
(void)In; // fake use to avoid build failure when assertions disabled
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
@@ -6798,6 +6799,33 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isCommutative(VL0)) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ SmallVector<ValueList> Operands;
+ for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
+ Operands.emplace_back();
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+ continue;
+ for (Value *V : VL) {
+ auto *CI2 = cast<CallInst>(V);
+ Operands.back().push_back(CI2->getArgOperand(I));
+ }
+ TE->setOperand(I, Operands.back());
+ }
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
+ if (Operands[I - 2].empty())
+ continue;
+ buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
+ }
+ return;
+ }
TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
// For scalar operands no need to create an entry since no need to
// vectorize it.
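Note: taken together, the new path reorders the first two argument lists with reorderInputsAccordingToOpcode, gathers each remaining argument lane by lane (skipping arguments that isVectorIntrinsicWithScalarOpAtArg reports as scalar-only, such as the scale argument of the fixed-point multiply intrinsics), and then recurses into every operand list. A hypothetical two-lane bundle this enables, analogous to the fmuladd cases in the updated tests below (function name invented; vectorization remains subject to the cost model):

    define void @fmuladd_pair(ptr %p, float %a, float %b, float %c) {
      ; Args 0 and 1 are commuted between the lanes and can be reordered
      ; into matching Left/Right lists; arg 2 is collected as a third
      ; operand list and recursed into separately.
      %f0 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
      %f1 = call float @llvm.fmuladd.f32(float %b, float %a, float %c)
      store float %f0, ptr %p
      %q = getelementptr float, ptr %p, i64 1
      store float %f1, ptr %q
      ret void
    }
    declare float @llvm.fmuladd.f32(float, float, float)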
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 89ea15d3ab3b..e4925961f284 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -142,17 +142,16 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-LABEL: define void @gather_2(
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
-; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
-; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
-; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
-; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
; CHECK-NEXT: ret void
;
@@ -358,12 +357,12 @@ define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
-; CHECK-NEXT: store float [[TMP2]], ptr [[ARRAYIDX5_I]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
-; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX8_I831]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 66e3fbf7845a..4cc3c1241b56 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
define void @PR49730() {
; CHECK-LABEL: @PR49730(
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index fb2b653aefc8..82085ade519e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@ define void @test() {
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 46cca9b078ac..1faeea716b17 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -142,8 +142,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index 66229c27f1a3..8b131ccd01c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
define void @test2() {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)