diff options
author | Nilanjana Basu <n_basu@apple.com> | 2023-11-17 17:38:04 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-17 17:38:04 -0800 |
commit | e2210cefb18171496573957945f9bd48eb631170 (patch) | |
tree | 8d6a48534d16319574d2324147142302edb4886c | |
parent | 9aa88b0f02a4a5dbc4072c11ed992eb617a6b2e2 (diff) |
[LV] Pre-committing tests for changing loop interleaving count computation (#70272)
Added tests for evaluating changes to loop interleaving count computation and for removing loop interleaving threshold in subsequent patches.
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll | 107 |
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll | 254 |
2 files changed, 344 insertions, 17 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll new file mode 100644 index 000000000000..061cdb564367 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll @@ -0,0 +1,107 @@ +; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s +; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed + +target triple = "aarch64-linux-gnu" + +%pair = type { i8, i8 } + +; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose +; IC 2 since there is no remainder loop run needed when the vector loop runs. +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 32 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose +; IC 1 since there may be a remainder loop that needs to run after the vector loop. 
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, 33 + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the +; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop +; won't need to run +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body, !prof !0 + +for.end: + ret void +} + +; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33, +; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the +; remainder loop will need to run +; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +define void @loop_with_profile_tc_33(ptr noalias %p, ptr 
noalias %q, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 + %tmp1 = load i8, ptr %tmp0, align 1 + %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 + %tmp3 = load i8, ptr %tmp2, align 1 + %add = add i8 %tmp1, %tmp3 + %qi = getelementptr i8, ptr %q, i64 %i + store i8 %add, ptr %qi, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body, !prof !1 + +for.end: + ret void +} + +!0 = !{!"branch_weights", i32 1, i32 31} +!1 = !{!"branch_weights", i32 1, i32 32} diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll index 290be569bc12..5b79d6af9ed9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll @@ -6,37 +6,257 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -; We don't unroll this loop because it has a small constant trip count. 
+; We don't unroll this loop because it has a small constant trip count +; that is not profitable for generating a scalar epilogue ; -; CHECK-VECTOR-LABEL: @foo( +; CHECK-VECTOR-LABEL: @foo_trip_count_8( ; CHECK-VECTOR: load <4 x i32> ; CHECK-VECTOR-NOT: load <4 x i32> ; CHECK-VECTOR: store <4 x i32> ; CHECK-VECTOR-NOT: store <4 x i32> ; CHECK-VECTOR: ret ; -; CHECK-SCALAR-LABEL: @foo( +; CHECK-SCALAR-LABEL: @foo_trip_count_8( ; CHECK-SCALAR: load i32, ptr ; CHECK-SCALAR-NOT: load i32, ptr ; CHECK-SCALAR: store i32 ; CHECK-SCALAR-NOT: store i32 ; CHECK-SCALAR: ret -define i32 @foo(ptr nocapture %A) nounwind uwtable ssp { - br label %1 +define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body -; <label>:1 ; preds = %1, %0 - %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] - %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %3 = load i32, ptr %2, align 4 - %4 = add nsw i32 %3, 6 - store i32 %4, ptr %2, align 4 +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; TODO: We should unroll this loop 4 times since TC being a multiple of VF means +; that the epilogue loop may not need to run, making it profitable for +; the vector loop to run even once +; +; CHECK-VECTOR-LABEL: @foo_trip_count_16( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo_trip_count_16( +; CHECK-SCALAR: load i32, ptr +; CHECK-SCALAR-NOT: load i32, ptr +; 
CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret +define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 16 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; TODO: We should unroll this loop twice since TC not being a multiple of VF may require +; the epilogue loop to run, making it profitable when the vector loop runs +; at least twice. +; +; CHECK-VECTOR-LABEL: @foo_trip_count_17( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo_trip_count_17( +; CHECK-SCALAR: load i32, ptr +; CHECK-SCALAR-NOT: load i32, ptr +; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret +define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 17 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; TODO: We should unroll this loop 4 times since TC being a multiple of VF means +; that the epilogue loop may not need to run, making it 
profitable for +; the vector loop to run even once. The IC is restricted to 4 since +; that is the maximum supported for the target. +; +; CHECK-VECTOR-LABEL: @foo_trip_count_24( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo_trip_count_24( +; CHECK-SCALAR: load i32, ptr +; CHECK-SCALAR-NOT: load i32, ptr +; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret +define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 24 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; TODO: We should unroll this loop twice since TC not being a multiple of VF may require +; the epilogue loop to run, making it profitable when the vector loop runs +; at least twice. 
+; +; CHECK-VECTOR-LABEL: @foo_trip_count_25( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo_trip_count_25( +; CHECK-SCALAR: load i32, ptr +; CHECK-SCALAR-NOT: load i32, ptr +; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret +define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require +; the epilogue loop to run, making it profitable when the vector loop runs +; at least twice. 
+; +; CHECK-VECTOR-LABEL: @foo_trip_count_33( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo_trip_count_33( +; CHECK-SCALAR: load i32, ptr +; CHECK-SCALAR-NOT: load i32, ptr +; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret +define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 100 - br i1 %exitcond, label %5, label %1 + %exitcond = icmp eq i32 %lftr.wideiv, 33 + br i1 %exitcond, label %for.end, label %for.body -; <label>:5 ; preds = %1 - ret i32 undef +for.end: ; preds = %for.body + ret void +} + +; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require +; the epilogue loop to run, making it profitable when the vector loop runs +; at least twice. The IC is restricted to 4 since that is the maximum supported +; for the target. 
+; +; CHECK-VECTOR-LABEL: @foo_trip_count_101( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo_trip_count_101( +; CHECK-SCALAR: load i32, ptr +; CHECK-SCALAR-NOT: load i32, ptr +; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret +define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %1 = load i32, ptr %0, align 4 + %2 = add nsw i32 %1, 6 + store i32 %2, ptr %0, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 101 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void } ; But this is a good small loop to unroll as we don't know of a bound on its @@ -53,7 +273,7 @@ define i32 @foo(ptr nocapture %A) nounwind uwtable ssp { ; CHECK-SCALAR: store i32 ; CHECK-SCALAR-NOT: store i32 ; CHECK-SCALAR: ret -define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { +define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -69,7 +289,7 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { br i1 %exitcond, label %._crit_edge, label %.lr.ph ._crit_edge: ; preds = %.lr.ph, %0 - ret i32 undef + ret void } ; Also unroll if we need a runtime check but it was going to be added for |