author    Alexey Bataev <a.bataev@outlook.com>    2024-05-07 21:18:00 +0000
committer Alexey Bataev <a.bataev@outlook.com>    2024-05-07 21:18:00 +0000
commit    aabb1d72f596b727ac9108e12ef25175b3cf64ad (patch)
tree      ec097b8928d0ba7d86fde2a3f0aa1cc765252db9
parent    f00f2941307e04d3b7320969ee3fec9af31246ba (diff)
Created using spr 1.3.5
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                                            70
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll      102
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll                    132
3 files changed, 199 insertions, 105 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3be0102bea3e..d27391142b5f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1554,6 +1554,12 @@ public:
}
}
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1642,6 +1648,14 @@ private:
ElementCount MaxSafeVF,
bool FoldTailByMasking);
+ /// true if scalable vectorization is supported and enabled.
+ std::optional<bool> IsScalableVectorizationAllowed;
+
+ /// Checks if the scalable vectorization is supported and enabled. The result
+ /// is stored in \p IsScalableVectorizationAllowed and used later, if
+ /// requested.
+ bool isScalableVectorizationAllowed();
+
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
@@ -4079,9 +4093,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// needs predication, or it was decided to use masking to deal with gaps
// (either a gap at the end of a load-access that may result in a speculative
// load, or any gaps in a store-access).
- bool PredicatedAccessRequiresMasking =
- blockNeedsPredicationForAnyReason(I->getParent()) &&
- Legal->isMaskRequired(I);
+ bool PredicatedAccessRequiresMasking = isPredicatedInst(I);
bool LoadAccessWithGapsRequiresEpilogMasking =
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
!isScalarEpilogueAllowed();
@@ -4397,15 +4409,17 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-ElementCount
-LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
+ if (IsScalableVectorizationAllowed)
+ return *IsScalableVectorizationAllowed;
+ IsScalableVectorizationAllowed = false;
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
- return ElementCount::getScalable(0);
+ return false;
if (Hints->isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
@@ -4425,7 +4439,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
"Scalable vectorization not supported for the reduction "
"operations found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
// Disable scalable vectorization if the loop contains any instructions
@@ -4437,9 +4451,20 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
reportVectorizationInfo("Scalable vectorization is not supported "
"for all element types found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
+ IsScalableVectorizationAllowed = true;
+ return true;
+}
+
+ElementCount
+LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+ if (!isScalableVectorizationAllowed())
+ return ElementCount::getScalable(0);
+
+ auto MaxScalableVF = ElementCount::getScalable(
+ std::numeric_limits<ElementCount::ScalarTy>::max());
if (Legal->isSafeForAnyVectorWidth())
return MaxScalableVF;
@@ -4642,6 +4667,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(isScalableVectorizationAllowed(), UserIC);
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF
@@ -4673,15 +4703,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -6096,7 +6122,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I)) {
+ if (isPredicatedInst(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
} else {
@@ -6150,7 +6176,7 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
+ I->getOpcode(), VectorTy, Ptr, isPredicatedInst(I), Alignment,
TargetTransformInfo::TCK_RecipThroughput, I);
}
@@ -6180,7 +6206,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
- AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
+ AS, CostKind, isPredicatedInst(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
@@ -6675,7 +6701,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
- bool MaskRequired = Legal->isMaskRequired(CI);
+ bool MaskRequired = isPredicatedInst(CI);
for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
@@ -7072,8 +7098,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI::CastContextHint::Interleave;
case LoopVectorizationCostModel::CM_Scalarize:
case LoopVectorizationCostModel::CM_Widen:
- return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
- : TTI::CastContextHint::Normal;
+ return isPredicatedInst(I) ? TTI::CastContextHint::Masked
+ : TTI::CastContextHint::Normal;
case LoopVectorizationCostModel::CM_Widen_Reverse:
return TTI::CastContextHint::Reversed;
case LoopVectorizationCostModel::CM_Unknown:
@@ -8121,7 +8147,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
return nullptr;
VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(I))
+ if (CM.isPredicatedInst(I))
Mask = getBlockInMask(I->getParent());
// Determine if the pointer operand of the access is either consecutive or
@@ -8329,7 +8355,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// vector variant at this VF requires a mask, so we synthesize an
// all-true mask.
VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(CI))
+ if (CM.isPredicatedInst(CI))
Mask = getBlockInMask(CI->getParent());
else
Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 0b495bc680f0..404c48facbef 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -13,18 +13,18 @@
define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-LABEL: @interleave(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8
-; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
@@ -36,8 +36,8 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -46,12 +46,12 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0
-; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1
-; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
@@ -64,18 +64,18 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP17]]
+; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 0
; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
-; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
+; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP35]]
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -84,10 +84,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP39]], [[TMP38]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -98,35 +98,49 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
;
; NO-VP-LABEL: @interleave(
; NO-VP-NEXT: entry:
-; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NO-VP: vector.ph:
-; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
-; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
-; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
-; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
-; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
-; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
-; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
-; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
-; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; NO-VP-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
+; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; NO-VP-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; NO-VP-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]]
+; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
+; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
+; NO-VP-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NO-VP: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index fcc3864a7aeb..d3f2e4472b64 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -489,54 +489,108 @@ loop.exit:
; Note that the then block has UB, but I could not find any other way to
; construct a suitable test case.
define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
-; CHECK-LABEL: @pr70590_recipe_without_underlying_instr(
+; CHECK-LABEL: define void @pr70590_recipe_without_underlying_instr(
+; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.+]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_SREM_CONTINUE6]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]],
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
-; CHECK: pred.srem.if:
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK: pred.load.if:
; CHECK-NEXT: [[TMP4:%.*]] = srem i64 3, 0
-; CHECK-NEXT: br label [[PRED_SREM_CONTINUE]]
-; CHECK: pred.srem.continue:
-; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ poison, %vector.body ], [ [[TMP4]], [[PRED_SREM_IF]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2:%.*]]
-; CHECK: pred.srem.if1:
-; CHECK-NEXT: [[TMP7:%.*]] = srem i64 3, 0
-; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]]
-; CHECK: pred.srem.continue2:
-; CHECK-NEXT: [[TMP8:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE]] ], [ [[TMP7]], [[PRED_SREM_IF1]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_SREM_IF3:%.*]], label [[PRED_SREM_CONTINUE4:%.*]]
-; CHECK: pred.srem.if3:
-; CHECK-NEXT: [[TMP10:%.*]] = srem i64 3, 0
-; CHECK-NEXT: br label [[PRED_SREM_CONTINUE4]]
-; CHECK: pred.srem.continue4:
-; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE2]] ], [ [[TMP10]], [[PRED_SREM_IF3]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_SREM_IF5:%.*]], label [[PRED_SREM_CONTINUE6]]
-; CHECK: pred.srem.if5:
-; CHECK-NEXT: [[TMP13:%.*]] = srem i64 3, 0
-; CHECK-NEXT: br label [[PRED_SREM_CONTINUE6]]
-; CHECK: pred.srem.continue6:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE4]] ], [ [[TMP13]], [[PRED_SREM_IF5]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP5]], -3
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP0]], [[TMP15]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], -3
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; CHECK: pred.load.continue:
+; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK: pred.load.if1:
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = srem i64 3, 0
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], -3
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP13]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[WIDE_LOAD]], <4 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
-; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[TMP18]], i32 1
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
+; CHECK: pred.load.continue2:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i8> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK: pred.load.if3:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = srem i64 3, 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], -3
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP23]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[TMP27]], align 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP28]], i32 2
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
+; CHECK: pred.load.continue4:
+; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP29]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; CHECK: pred.load.if5:
+; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = srem i64 3, 0
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP34]], -3
+; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP33]], [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[TMP37]], align 1
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP38]], i32 3
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
+; CHECK: pred.load.continue6:
+; CHECK-NEXT: [[TMP40:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP41:%.*]] = phi <4 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP39]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[TMP41]], <4 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP42]], i32 0
+; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP43]], align 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[THEN:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[REM:%.*]] = srem i64 3, 0
+; CHECK-NEXT: [[ADD3:%.*]] = add i64 [[REM]], -3
+; CHECK-NEXT: [[ADD5:%.*]] = add i64 [[IV]], [[ADD3]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[ADD5]]
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[SR:%.*]] = phi i8 [ 0, [[LOOP_HEADER]] ], [ [[L]], [[THEN]] ]
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store i8 [[SR]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
entry:
br label %loop.header