author    | Alexey Bataev <a.bataev@outlook.com> | 2024-04-08 17:55:22 +0000
committer | Alexey Bataev <a.bataev@outlook.com> | 2024-04-08 17:55:22 +0000
commit    | 48da8bd547f7904ba110c87c82aff8a8c43e2d01 (patch)
tree      | 5c8448fe2d8cebce77e5cba9b6d9f8a959326ecf
parent    | 78c50bbd45de595e9992bf97aa097f7f589f8370 (diff)
[spr] initial version
Branch: upstream/users/alexey-bataev/spr/lvevlsupport-reversed-loadsstores
Created using spr 1.3.5
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 67
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll | 23 |
2 files changed, 55 insertions(+), 35 deletions(-)
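At a high level, the patch deletes the FIXME guard that disabled EVL tail folding whenever a reversed (CM_Widen_Reverse) memory access was present, and instead lowers reversed accesses through llvm.vp.load / llvm.vp.store combined with llvm.experimental.vp.reverse. A minimal sketch of the resulting load pattern, modeled on the test update below (the function and value names here are illustrative, not taken from the patch):

define <vscale x 4 x i32> @reversed_load_sketch(ptr %base, i32 %evl) {
  ; Splat an all-true mask; under EVL tail folding the active lanes are
  ; bounded by %evl rather than by a computed header mask.
  %allones.head = insertelement <vscale x 4 x i1> poison, i1 true, i64 0
  %allones = shufflevector <vscale x 4 x i1> %allones.head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ; Load the first %evl lanes in memory order ...
  %wide = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %base, <vscale x 4 x i1> %allones, i32 %evl)
  ; ... then reverse those lanes so they come back in the original
  ; (descending) access order.
  %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide, <vscale x 4 x i1> %allones, i32 %evl)
  ret <vscale x 4 x i32> %rev
}

declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)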
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9e22dce38477..797a3fd1e9db 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1579,13 +1579,7 @@ public:
   /// Returns true if VP intrinsics with explicit vector length support should
   /// be generated in the tail folded loop.
   bool foldTailWithEVL() const {
-    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
-           // FIXME: remove this once vp_reverse is supported.
-           none_of(
-               WideningDecisions,
-               [](const std::pair<std::pair<Instruction *, ElementCount>,
-                                  std::pair<InstWidening, InstructionCost>>
-                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
   }
 
   /// Returns true if the Phi is part of an inloop reduction.
@@ -9361,10 +9355,17 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
 
 /// Creates either vp_store or vp_scatter intrinsics calls to represent
 /// predicated store/scatter.
-static Instruction *
-lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
-                                Value *StoredVal, bool IsScatter, Value *Mask,
-                                Value *EVL, const Align &Alignment) {
+static Instruction *lowerStoreUsingVectorIntrinsics(
+    IRBuilderBase &Builder, Value *Addr, Value *StoredVal, bool IsScatter,
+    bool IsReverse, Value *Mask, Value *EVL, const Align &Alignment) {
+  if (IsReverse) {
+    auto *StoredValTy = cast<VectorType>(StoredVal->getType());
+    Value *BlockInMaskPart =
+        Builder.getAllOnesMask(StoredValTy->getElementCount());
+    StoredVal = Builder.CreateIntrinsic(
+        StoredValTy, Intrinsic::experimental_vp_reverse,
+        {StoredVal, BlockInMaskPart, EVL}, nullptr, "vp.reverse");
+  }
   CallInst *Call;
   if (IsScatter) {
     Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
@@ -9384,11 +9385,9 @@ lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
 
 /// Creates either vp_load or vp_gather intrinsics calls to represent
 /// predicated load/gather.
-static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
-                                                   VectorType *DataTy,
-                                                   Value *Addr, bool IsGather,
-                                                   Value *Mask, Value *EVL,
-                                                   const Align &Alignment) {
+static Instruction *lowerLoadUsingVectorIntrinsics(
+    IRBuilderBase &Builder, VectorType *DataTy, Value *Addr, bool IsGather,
+    bool IsReverse, Value *Mask, Value *EVL, const Align &Alignment) {
   CallInst *Call;
   if (IsGather) {
     Call =
@@ -9402,7 +9401,14 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
   }
   Call->addParamAttr(
       0, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
+  Instruction *Res = Call;
+  if (IsReverse) {
+    Value *BlockInMaskPart = Builder.getAllOnesMask(DataTy->getElementCount());
+    Res = Builder.CreateIntrinsic(DataTy, Intrinsic::experimental_vp_reverse,
+                                  {Res, BlockInMaskPart, EVL}, nullptr,
+                                  "vp.reverse");
+  }
+  return Res;
 }
 
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9430,7 +9436,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
       // a null all-one mask is a null mask.
      for (unsigned Part = 0; Part < State.UF; ++Part) {
        Value *Mask = State.get(getMask(), Part);
-       if (isReverse())
+       if (isReverse() && !State.EVL)
          Mask = Builder.CreateVectorReverse(Mask, "reverse");
        BlockInMaskParts[Part] = Mask;
      }
@@ -9456,11 +9462,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         // is created only if TTI prefers predicated vectorization, thus if EVL
         // is not nullptr it also implies preference for predicated
         // vectorization.
-        // FIXME: Support reverse store after vp_reverse is added.
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        if (isMaskRequired && isReverse() && !getMask()->isLiveIn()) {
+          VectorType *MaskTy = cast<VectorType>(MaskPart->getType());
+          Value *BlockInMaskPart =
+              Builder.getAllOnesMask(MaskTy->getElementCount());
+          MaskPart = Builder.CreateIntrinsic(
+              MaskTy, Intrinsic::experimental_vp_reverse,
+              {MaskPart, BlockInMaskPart, EVL}, nullptr, "vp.reverse.mask");
+          BlockInMaskParts[Part] = MaskPart;
+        }
         NewSI = lowerStoreUsingVectorIntrinsics(
             Builder, State.get(getAddr(), Part, !CreateGatherScatter),
-            StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+            StoredVal, CreateGatherScatter, isReverse(), MaskPart, EVL,
+            Alignment);
       } else if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
@@ -9504,11 +9519,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
       // is created only if TTI prefers predicated vectorization, thus if EVL
       // is not nullptr it also implies preference for predicated
       // vectorization.
-      // FIXME: Support reverse loading after vp_reverse is added.
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      if (isMaskRequired && isReverse() && !getMask()->isLiveIn()) {
+        VectorType *MaskTy = cast<VectorType>(MaskPart->getType());
+        Value *BlockInMaskPart =
+            Builder.getAllOnesMask(MaskTy->getElementCount());
+        MaskPart = Builder.CreateIntrinsic(
+            MaskTy, Intrinsic::experimental_vp_reverse,
+            {MaskPart, BlockInMaskPart, EVL}, nullptr, "vp.reverse.mask");
+        BlockInMaskParts[Part] = MaskPart;
+      }
       NewLI = lowerLoadUsingVectorIntrinsics(
           Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
-          CreateGatherScatter, MaskPart, EVL, Alignment);
+          CreateGatherScatter, isReverse(), MaskPart, EVL, Alignment);
     } else if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
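The store side mirrors the load side: lowerStoreUsingVectorIntrinsics now reverses the data before issuing the vp.store, and VPWidenMemoryInstructionRecipe::execute additionally reverses a required loop-varying mask (the reversal is skipped when the mask is a live-in). A sketch under the same illustrative assumptions as above:

define void @reversed_store_sketch(<vscale x 4 x i32> %val, <vscale x 4 x i1> %mask, ptr %base, i32 %evl) {
  %allones.head = insertelement <vscale x 4 x i1> poison, i1 true, i64 0
  %allones = shufflevector <vscale x 4 x i1> %allones.head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ; Reverse the data to store (what the helper does when IsReverse is set).
  %val.rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %val, <vscale x 4 x i1> %allones, i32 %evl)
  ; Reverse the loop-varying mask as well ("vp.reverse.mask" in the patch).
  %mask.rev = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %allones, i32 %evl)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val.rev, ptr align 4 %base, <vscale x 4 x i1> %mask.rev, i32 %evl)
  ret void
}

declare <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)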
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index f2222e0a1f93..f839eafe9b2a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -30,14 +30,11 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP7]], -1
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
@@ -46,9 +43,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; IF-EVL-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP31:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
@@ -56,9 +52,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
-; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT:    [[TMP28:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP28]], ptr align 4 [[TMP25]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP8]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP29]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
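For context, the IF-EVL output above comes from vectorizing a copy loop that walks two noalias arrays with a descending index. A plausible scalar-IR reconstruction of the test input (not taken verbatim from the test file):

define void @reverse_load_store_scalar(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
entry:
  br label %for.body

for.body:
  ; %add.phi counts down from %startval; %i counts the 1024 iterations.
  %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.body ]
  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %add = add i64 %add.phi, -1
  %gepl = getelementptr inbounds i32, ptr %ptr, i64 %add
  %tmp = load i32, ptr %gepl, align 4
  %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add
  store i32 %tmp, ptr %geps, align 4
  %inc = add i32 %i, 1
  %exitcond = icmp ne i32 %inc, 1024
  br i1 %exitcond, label %for.body, label %loopend

loopend:
  ret void
}

With EVL tail folding, the vector body computes the remaining trip count (1024 - EVL_BASED_IV), asks @llvm.experimental.get.vector.length for this iteration's element count, and advances the EVL-based induction variable by that amount, so the old per-lane header mask (the removed VEC_IV / icmp ule sequence) is no longer needed.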