diff options
author | Alexey Bataev <a.bataev@outlook.com> | 2024-05-07 21:18:00 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@outlook.com> | 2024-05-07 21:18:00 +0000 |
commit | aabb1d72f596b727ac9108e12ef25175b3cf64ad (patch) | |
tree | ec097b8928d0ba7d86fde2a3f0aa1cc765252db9 | |
parent | f00f2941307e04d3b7320969ee3fec9af31246ba (diff) |
[𝘀𝗽𝗿] initial versionupstream/users/alexey-bataev/spr/lvset-tailfolding-styles-before-computing-feasible-max-vf
Created using spr 1.3.5
3 files changed, 199 insertions, 105 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3be0102bea3e..d27391142b5f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1554,6 +1554,12 @@ public: } } + void disableTailFolding() { + assert(ChosenTailFoldingStyle && "Tail folding must be selected."); + ChosenTailFoldingStyle = + std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); + } + /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { // TODO: check if it is possible to check for None style independent of @@ -1642,6 +1648,14 @@ private: ElementCount MaxSafeVF, bool FoldTailByMasking); + /// true of scalable vectorization is supported and enabled. + std::optional<bool> IsScalableVectorizationAllowed; + + /// Checks if the scalable vectorization is supported and enabled. The result + /// is stored in \p IsScalableVectorizationAllowed and used later, if + /// requested. + bool isScalableVectorizationAllowed(); + /// \return the maximum legal scalable VF, based on the safe max number /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); @@ -4079,9 +4093,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // needs predication, or it was decided to use masking to deal with gaps // (either a gap at the end of a load-access that may result in a speculative // load, or any gaps in a store-access). - bool PredicatedAccessRequiresMasking = - blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + bool PredicatedAccessRequiresMasking = isPredicatedInst(I); bool LoadAccessWithGapsRequiresEpilogMasking = isa<LoadInst>(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -4397,15 +4409,17 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -ElementCount -LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { +bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { + if (IsScalableVectorizationAllowed) + return *IsScalableVectorizationAllowed; + IsScalableVectorizationAllowed = false; if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) - return ElementCount::getScalable(0); + return false; if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", "ScalableVectorizationDisabled", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); @@ -4425,7 +4439,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { "Scalable vectorization not supported for the reduction " "operations found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } // Disable scalable vectorization if the loop contains any instructions @@ -4437,9 +4451,20 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { reportVectorizationInfo("Scalable vectorization is not supported " "for all element types found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } + IsScalableVectorizationAllowed = true; + return true; +} + +ElementCount +LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { + if (!isScalableVectorizationAllowed()) + return ElementCount::getScalable(0); + + auto MaxScalableVF = ElementCount::getScalable( + std::numeric_limits<ElementCount::ScalarTy>::max()); if (Legal->isSafeForAnyVectorWidth()) return MaxScalableVF; @@ -4642,6 +4667,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } + // If we don't know the precise trip count, or if the trip count that we + // found modulo the vectorization factor is not zero, try to fold the tail + // by masking. + // FIXME: look for a smaller MaxVF that does divide TC rather than masking. + setTailFoldingStyles(isScalableVectorizationAllowed(), UserIC); FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF @@ -4673,15 +4703,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Rem->isZero()) { // Accept MaxFixedVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + disableTailFolding(); return MaxFactors; } } - // If we don't know the precise trip count, or if the trip count that we - // found modulo the vectorization factor is not zero, try to fold the tail - // by masking. - // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); if (foldTailByMasking()) { if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { LLVM_DEBUG( @@ -6096,7 +6122,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (isPredicatedInst(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6150,7 +6176,7 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, isPredicatedInst(I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } @@ -6180,7 +6206,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, isPredicatedInst(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. @@ -6675,7 +6701,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { Function *ScalarFunc = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; - bool MaskRequired = Legal->isMaskRequired(CI); + bool MaskRequired = isPredicatedInst(CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); @@ -7072,8 +7098,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return isPredicatedInst(I) ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -8121,7 +8147,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (CM.isPredicatedInst(I)) Mask = getBlockInMask(I->getParent()); // Determine if the pointer operand of the access is either consecutive or @@ -8329,7 +8355,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // vector variant at this VF requires a mask, so we synthesize an // all-true mask. VPValue *Mask = nullptr; - if (Legal->isMaskRequired(CI)) + if (CM.isPredicatedInst(CI)) Mask = getBlockInMask(CI->getParent()); else Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index 0b495bc680f0..404c48facbef 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -13,18 +13,18 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-LABEL: @interleave( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]] -; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8 -; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]] +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 -; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]] +; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 @@ -36,8 +36,8 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]] ; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 -; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]] -; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0 +; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]] +; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0 ; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer @@ -46,12 +46,12 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] -; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0 ; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 -; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0 -; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1 -; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]] +; IF-EVL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 0 +; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 1 +; IF-EVL-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]] ; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0 @@ -64,18 +64,18 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison) ; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]] ; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]] -; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] -; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP17]] +; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] +; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 0 ; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 -; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]] +; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP35]] ; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]]) ; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]]) ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]] -; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -84,10 +84,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0 -; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1 -; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]] +; IF-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP39]], [[TMP38]] ; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -98,35 +98,49 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; ; NO-VP-LABEL: @interleave( ; NO-VP-NEXT: entry: -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: -; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: ; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0 -; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0 -; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 -; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 -; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> -; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> -; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]] -; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]] -; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] -; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0 -; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8 -; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4 -; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4 -; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; NO-VP-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0 +; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0 +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4 +; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4 +; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) +; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 +; NO-VP-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 +; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]]) +; NO-VP-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0 +; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1 +; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]] +; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]] +; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0 +; NO-VP-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 +; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]] +; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4 +; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; NO-VP: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index fcc3864a7aeb..d3f2e4472b64 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -489,54 +489,108 @@ loop.exit: ; Note that the then block has UB, but I could not find any other way to ; construct a suitable test case. define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) { -; CHECK-LABEL: @pr70590_recipe_without_underlying_instr( +; CHECK-LABEL: define void @pr70590_recipe_without_underlying_instr( +; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.+]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_SREM_CONTINUE6]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]] -; CHECK: pred.srem.if: +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: ; CHECK-NEXT: [[TMP4:%.*]] = srem i64 3, 0 -; CHECK-NEXT: br label [[PRED_SREM_CONTINUE]] -; CHECK: pred.srem.continue: -; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ poison, %vector.body ], [ [[TMP4]], [[PRED_SREM_IF]] ] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2:%.*]] -; CHECK: pred.srem.if1: -; CHECK-NEXT: [[TMP7:%.*]] = srem i64 3, 0 -; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]] -; CHECK: pred.srem.continue2: -; CHECK-NEXT: [[TMP8:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE]] ], [ [[TMP7]], [[PRED_SREM_IF1]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_SREM_IF3:%.*]], label [[PRED_SREM_CONTINUE4:%.*]] -; CHECK: pred.srem.if3: -; CHECK-NEXT: [[TMP10:%.*]] = srem i64 3, 0 -; CHECK-NEXT: br label [[PRED_SREM_CONTINUE4]] -; CHECK: pred.srem.continue4: -; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE2]] ], [ [[TMP10]], [[PRED_SREM_IF3]] ] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_SREM_IF5:%.*]], label [[PRED_SREM_CONTINUE6]] -; CHECK: pred.srem.if5: -; CHECK-NEXT: [[TMP13:%.*]] = srem i64 3, 0 -; CHECK-NEXT: br label [[PRED_SREM_CONTINUE6]] -; CHECK: pred.srem.continue6: -; CHECK-NEXT: [[TMP14:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE4]] ], [ [[TMP13]], [[PRED_SREM_IF5]] ] -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP5]], -3 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP0]], [[TMP15]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], -3 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = srem i64 3, 0 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], -3 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP13]], [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[WIDE_LOAD]], <4 x i8> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP0]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0 -; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[TMP18]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i8> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP24:%.*]] = srem i64 3, 0 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], -3 +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP28]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP29]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP34:%.*]] = srem i64 3, 0 +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP34]], -3 +; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP33]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[TMP37]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP38]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP40:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP41:%.*]] = phi <4 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP39]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[TMP41]], <4 x i8> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP42]], i32 0 +; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP43]], align 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[REM:%.*]] = srem i64 3, 0 +; CHECK-NEXT: [[ADD3:%.*]] = add i64 [[REM]], -3 +; CHECK-NEXT: [[ADD5:%.*]] = add i64 [[IV]], [[ADD3]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[ADD5]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[SR:%.*]] = phi i8 [ 0, [[LOOP_HEADER]] ], [ [[L]], [[THEN]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 [[SR]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %loop.header |