Diffstat (limited to 'lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r--   lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp   1089
1 file changed, 803 insertions, 286 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 7046ab3aa3..ca1e9311b6 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -1,9 +1,8 @@
 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -61,13 +60,19 @@ enum OpenMPRTLFunctionNVPTX {
   /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
   /// lane_offset, int16_t shortCircuit),
   /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
-  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2,
-  /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
-  /// global_tid, kmp_critical_name *lck)
-  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple,
-  /// Call to __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc,
-  /// kmp_int32 global_tid, kmp_critical_name *lck)
-  OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple,
+  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
+  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
+  /// global_tid, void *global_buffer, int32_t num_of_records, void*
+  /// reduce_data,
+  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+  /// lane_offset, int16_t shortCircuit),
+  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
+  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
+  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
+  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
+  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
+  /// *buffer, int idx, void *reduce_data));
+  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
   /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
   OMPRTL_NVPTX__kmpc_end_reduce_nowait,
   /// Call to void __kmpc_data_sharing_init_stack();
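Editor's note: for orientation, the doc comment above assembles into roughly the
following C++ prototype for the new teams-reduction entry point. This is a
sketch derived from the comment, not text from the patch; ident_t is the
runtime's opaque location struct, and the typedef/parameter names are
editor-chosen (the parameter order of the four buffer helpers follows the
call site emitted later in this patch).

    // Sketch only; names other than the entry point are assumptions.
    #include <cstdint>
    struct ident_t;
    using ShuffleReductFn = void (*)(void *rhs_data, int16_t lane_id,
                                     int16_t lane_offset, int16_t short_circuit);
    using InterWarpCopyFn = void (*)(void *src, int32_t warp_num);
    using ListGlobalFn = void (*)(void *buffer, int idx, void *reduce_data);

    extern "C" int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
        ident_t *loc, int32_t global_tid,
        void *global_buffer,     // one record per team
        int32_t num_of_records,  // OpenMPCUDAReductionBufNum at the call site
        void *reduce_data, ShuffleReductFn shuffle_and_reduce,
        InterWarpCopyFn inter_warp_copy, ListGlobalFn list_to_global_copy,
        ListGlobalFn list_to_global_reduce, ListGlobalFn global_to_list_copy,
        ListGlobalFn global_to_list_reduce);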
@@ -106,17 +111,18 @@ enum OpenMPRTLFunctionNVPTX {
 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
 class NVPTXActionTy final : public PrePostActionTy {
-  llvm::Value *EnterCallee = nullptr;
+  llvm::FunctionCallee EnterCallee = nullptr;
   ArrayRef<llvm::Value *> EnterArgs;
-  llvm::Value *ExitCallee = nullptr;
+  llvm::FunctionCallee ExitCallee = nullptr;
   ArrayRef<llvm::Value *> ExitArgs;
   bool Conditional = false;
   llvm::BasicBlock *ContBlock = nullptr;

 public:
-  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
-                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
-                bool Conditional = false)
+  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
+                ArrayRef<llvm::Value *> EnterArgs,
+                llvm::FunctionCallee ExitCallee,
+                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
       : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
         ExitArgs(ExitArgs), Conditional(Conditional) {}
   void Enter(CodeGenFunction &CGF) override {
@@ -215,16 +221,13 @@ static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
   return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
 }

-typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
-static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
-  return P1.first > P2.first;
-}
-
 static RecordDecl *buildRecordForGlobalizedVars(
     ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
     ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-        &MappedDeclsFields) {
+        &MappedDeclsFields, int BufSize) {
+  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
   if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
     return nullptr;
   SmallVector<VarsDataTy, 4> GlobalizedVars;
@@ -236,8 +239,10 @@ static RecordDecl *buildRecordForGlobalizedVars(
                              D);
   for (const ValueDecl *D : EscapedDeclsForTeams)
     GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
-  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
-                   stable_sort_comparator);
+  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
+    return L.first > R.first;
+  });
+
   // Build struct _globalized_locals_ty {
   //         /* globalized vars */[WarpSize] align (max(decl_align,
   //         GlobalMemoryAlignment))
@@ -270,7 +275,7 @@ static RecordDecl *buildRecordForGlobalizedVars(
         Field->addAttr(*I);
     }
   } else {
-    llvm::APInt ArraySize(32, WarpSize);
+    llvm::APInt ArraySize(32, BufSize);
     Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
     Field = FieldDecl::Create(
         C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
@@ -312,6 +317,9 @@ class CheckVarsEscapingDeclContext final
         OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
       return;
     VD = cast<ValueDecl>(VD->getCanonicalDecl());
+    // Use user-specified allocation.
+    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
+      return;
     // Variables captured by value must be globalized.
     if (auto *CSI = CGF.CapturedStmtInfo) {
       if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
@@ -419,7 +427,7 @@ class CheckVarsEscapingDeclContext final
       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
     GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
-        MappedDeclsFields);
+        MappedDeclsFields, WarpSize);
   }

 public:
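Editor's note: to make the record above concrete, for two escaped locals
`double d; int i;` globalized for a full warp (BufSize == WarpSize == 32), the
helper builds roughly the following C++ equivalent. A sketch under stated
assumptions: per-field alignment is max(decl_align, GlobalMemoryAlignment),
and GlobalMemoryAlignment is 128 bytes in this file; field order follows the
descending-alignment stable sort.

    // Hypothetical input: escaped locals `double d; int i;`, BufSize == 32.
    struct _globalized_locals_ty {
      double d[32] __attribute__((aligned(128))); // max(alignof(double), 128)
      int i[32] __attribute__((aligned(128)));    // max(alignof(int), 128)
    };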
@@ -705,112 +713,37 @@ getDataSharingMode(CodeGenModule &CGM) {
                                         : CGOpenMPRuntimeNVPTX::Generic;
 }

-/// Checks if the expression is constant or does not have non-trivial function
-/// calls.
-static bool isTrivial(ASTContext &Ctx, const Expr * E) {
-  // We can skip constant expressions.
-  // We can skip expressions with trivial calls or simple expressions.
-  return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
-          !E->hasNonTrivialCall(Ctx)) &&
-         !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
-}
-
-/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
-/// iff there is only one that is not evaluatable at the compile time.
-static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
-  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
-    const Stmt *Child = nullptr;
-    for (const Stmt *S : C->body()) {
-      if (const auto *E = dyn_cast<Expr>(S)) {
-        if (isTrivial(Ctx, E))
-          continue;
-      }
-      // Some of the statements can be ignored.
-      if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
-          isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
-        continue;
-      // Analyze declarations.
-      if (const auto *DS = dyn_cast<DeclStmt>(S)) {
-        if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
-              if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
-                  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
-                  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
-                  isa<UsingDirectiveDecl>(D) ||
-                  isa<OMPDeclareReductionDecl>(D) ||
-                  isa<OMPThreadPrivateDecl>(D))
-                return true;
-              const auto *VD = dyn_cast<VarDecl>(D);
-              if (!VD)
-                return false;
-              return VD->isConstexpr() ||
-                     ((VD->getType().isTrivialType(Ctx) ||
-                       VD->getType()->isReferenceType()) &&
-                      (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
-            }))
-          continue;
-      }
-      // Found multiple children - cannot get the one child only.
-      if (Child)
-        return Body;
-      Child = S;
-    }
-    if (Child)
-      return Child;
-  }
-  return Body;
-}
-
-/// Check if the parallel directive has an 'if' clause with non-constant or
-/// false condition. Also, check if the number of threads is strictly specified
-/// and run those directives in non-SPMD mode.
-static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
-                                          const OMPExecutableDirective &D) {
-  if (D.hasClausesOfKind<OMPNumThreadsClause>())
-    return true;
-  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
-    OpenMPDirectiveKind NameModifier = C->getNameModifier();
-    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
-      continue;
-    const Expr *Cond = C->getCondition();
-    bool Result;
-    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
-      return true;
-  }
-  return false;
-}
-
 /// Check for inner (nested) SPMD construct, if any
 static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                    const OMPExecutableDirective &D) {
   const auto *CS = D.getInnermostCapturedStmt();
   const auto *Body =
       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
-  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
+  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

-  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+  if (const auto *NestedDir =
+          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
     switch (D.getDirectiveKind()) {
     case OMPD_target:
-      if (isOpenMPParallelDirective(DKind) &&
-          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
+      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
-        ChildStmt = getSingleCompoundChild(Ctx, Body);
-        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
+        if (const auto *NND =
+                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
-          if (isOpenMPParallelDirective(DKind) &&
-              !hasParallelIfNumThreadsClause(Ctx, *NND))
+          if (isOpenMPParallelDirective(DKind))
             return true;
         }
       }
       return false;
     case OMPD_target_teams:
-      return isOpenMPParallelDirective(DKind) &&
-             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
+      return isOpenMPParallelDirective(DKind);
     case OMPD_target_simd:
     case OMPD_target_parallel:
     case OMPD_target_parallel_for:
@@ -829,6 +762,7 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
     case OMPD_cancellation_point:
     case OMPD_ordered:
     case OMPD_threadprivate:
+    case OMPD_allocate:
     case OMPD_task:
     case OMPD_simd:
     case OMPD_sections:
@@ -859,6 +793,7 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
     case OMPD_declare_target:
     case OMPD_end_declare_target:
     case OMPD_declare_reduction:
+    case OMPD_declare_mapper:
     case OMPD_taskloop:
     case OMPD_taskloop_simd:
     case OMPD_requires:
@@ -882,10 +817,10 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
   case OMPD_target_parallel_for_simd:
   case OMPD_target_teams_distribute_parallel_for:
   case OMPD_target_teams_distribute_parallel_for_simd:
-    return !hasParallelIfNumThreadsClause(Ctx, D);
   case OMPD_target_simd:
-  case OMPD_target_teams_distribute:
   case OMPD_target_teams_distribute_simd:
+    return true;
+  case OMPD_target_teams_distribute:
     return false;
   case OMPD_parallel:
   case OMPD_for:
@@ -897,6 +832,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
   case OMPD_cancellation_point:
   case OMPD_ordered:
   case OMPD_threadprivate:
+  case OMPD_allocate:
   case OMPD_task:
   case OMPD_simd:
   case OMPD_sections:
@@ -927,6 +863,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
   case OMPD_declare_target:
   case OMPD_end_declare_target:
   case OMPD_declare_reduction:
+  case OMPD_declare_mapper:
   case OMPD_taskloop:
   case OMPD_taskloop_simd:
   case OMPD_requires:
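Editor's example (not part of the patch): the practical effect of dropping
hasParallelIfNumThreadsClause is visible on a region like the one below. With
this change the detection above selects SPMD mode for it; previously the
num_threads clause (or a non-constant parallel if clause) forced the slower
generic-mode kernel.

    void vec_add(const float *a, const float *b, float *c, int n) {
    #pragma omp target teams distribute parallel for num_threads(128)
      for (int i = 0; i < n; ++i)
        c[i] = a[i] + b[i];
    }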
@@ -958,9 +895,10 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
   const auto *CS = D.getInnermostCapturedStmt();
   const auto *Body =
       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
-  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
+  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

-  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+  if (const auto *NestedDir =
+          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
     switch (D.getDirectiveKind()) {
     case OMPD_target:
@@ -968,13 +906,16 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
           hasStaticScheduling(*NestedDir))
         return true;
+      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
+        return true;
       if (DKind == OMPD_parallel) {
         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
             /*IgnoreCaptured=*/true);
         if (!Body)
           return false;
-        ChildStmt = getSingleCompoundChild(Ctx, Body);
-        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
+        if (const auto *NND =
+                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
           if (isOpenMPWorksharingDirective(DKind) &&
               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
@@ -985,8 +926,9 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
             /*IgnoreCaptured=*/true);
         if (!Body)
           return false;
-        ChildStmt = getSingleCompoundChild(Ctx, Body);
-        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
+        if (const auto *NND =
+                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
           if (isOpenMPParallelDirective(DKind) &&
               isOpenMPWorksharingDirective(DKind) &&
@@ -997,8 +939,9 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
             /*IgnoreCaptured=*/true);
         if (!Body)
           return false;
-        ChildStmt = getSingleCompoundChild(Ctx, Body);
-        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
+        if (const auto *NND =
+                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
           if (isOpenMPWorksharingDirective(DKind) &&
               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
@@ -1013,13 +956,16 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
           hasStaticScheduling(*NestedDir))
         return true;
+      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
+        return true;
       if (DKind == OMPD_parallel) {
         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
             /*IgnoreCaptured=*/true);
         if (!Body)
           return false;
-        ChildStmt = getSingleCompoundChild(Ctx, Body);
-        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
+        if (const auto *NND =
+                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
           if (isOpenMPWorksharingDirective(DKind) &&
               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
@@ -1028,6 +974,8 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
       }
       return false;
     case OMPD_target_parallel:
+      if (DKind == OMPD_simd)
+        return true;
       return isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
     case OMPD_target_teams_distribute:
@@ -1047,6 +995,7 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
     case OMPD_cancellation_point:
     case OMPD_ordered:
     case OMPD_threadprivate:
+    case OMPD_allocate:
     case OMPD_task:
     case OMPD_simd:
     case OMPD_sections:
@@ -1077,6 +1026,7 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
     case OMPD_declare_target:
     case OMPD_end_declare_target:
     case OMPD_declare_reduction:
+    case OMPD_declare_mapper:
     case OMPD_taskloop:
     case OMPD_taskloop_simd:
     case OMPD_requires:
@@ -1107,8 +1057,9 @@ static bool supportsLightweightRuntime(ASTContext &Ctx,
     // (Last|First)-privates must be shared in parallel region.
     return hasStaticScheduling(D);
   case OMPD_target_simd:
-  case OMPD_target_teams_distribute:
   case OMPD_target_teams_distribute_simd:
+    return true;
+  case OMPD_target_teams_distribute:
     return false;
   case OMPD_parallel:
   case OMPD_for:
@@ -1120,6 +1071,7 @@ static bool supportsLightweightRuntime(ASTContext &Ctx,
   case OMPD_cancellation_point:
   case OMPD_ordered:
   case OMPD_threadprivate:
+  case OMPD_allocate:
   case OMPD_task:
   case OMPD_simd:
   case OMPD_sections:
@@ -1150,6 +1102,7 @@ static bool supportsLightweightRuntime(ASTContext &Ctx,
   case OMPD_declare_target:
   case OMPD_end_declare_target:
   case OMPD_declare_reduction:
+  case OMPD_declare_mapper:
   case OMPD_taskloop:
   case OMPD_taskloop_simd:
   case OMPD_requires:
@@ -1512,14 +1465,14 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
   // directive.
   auto *ParallelFnTy =
       llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
-                              /*isVarArg=*/false)
-          ->getPointerTo();
-  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
+                              /*isVarArg=*/false);
+  llvm::Value *WorkFnCast =
+      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
   // Insert call to work function via shared wrapper. The shared
   // wrapper takes two arguments:
   //   - the parallelism level;
   //   - the thread ID;
-  emitCall(CGF, WST.Loc, WorkFnCast,
+  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
   // Go to end of parallel region.
   CGF.EmitBranch(TerminateBB);
@@ -1547,9 +1500,9 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
 /// implementation. Specialized for the NVPTX device.
 /// \param Function OpenMP runtime function.
 /// \return Specified function.
-llvm::Constant *
+llvm::FunctionCallee
 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
-  llvm::Constant *RTLFn = nullptr;
+  llvm::FunctionCallee RTLFn = nullptr;
   switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
   case OMPRTL_NVPTX__kmpc_kernel_init: {
     // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
@@ -1647,7 +1600,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
     break;
   }
-  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: {
+  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
     // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
     // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
     // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
@@ -1684,28 +1637,47 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
         FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
     break;
   }
-  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple: {
-    // Build __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
-    // global_tid, kmp_critical_name *lck)
-    llvm::Type *TypeParams[] = {
-        getIdentTyPointerTy(), CGM.Int32Ty,
-        llvm::PointerType::getUnqual(getKmpCriticalNameTy())};
+  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
+    // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
+    // global_tid, void *global_buffer, int32_t num_of_records, void*
+    // reduce_data,
+    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+    // lane_offset, int16_t shortCircuit),
+    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
+    // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
+    // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
+    // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
+    // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
+    // *buffer, int idx, void *reduce_data));
+    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
+                                             CGM.Int16Ty, CGM.Int16Ty};
+    auto *ShuffleReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
+    auto *InterWarpCopyFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
+                                          CGM.VoidPtrTy};
+    auto *GlobalListFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
+                                CGM.Int32Ty,
+                                CGM.VoidPtrTy,
+                                CGM.Int32Ty,
+                                CGM.VoidPtrTy,
+                                ShuffleReduceFnTy->getPointerTo(),
+                                InterWarpCopyFnTy->getPointerTo(),
+                                GlobalListFnTy->getPointerTo(),
+                                GlobalListFnTy->getPointerTo(),
+                                GlobalListFnTy->getPointerTo(),
+                                GlobalListFnTy->getPointerTo()};
     auto *FnTy =
         llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
     RTLFn = CGM.CreateRuntimeFunction(
-        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_simple");
-    break;
-  }
-  case OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple: {
-    // Build __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc, kmp_int32
-    // global_tid, kmp_critical_name *lck)
-    llvm::Type *TypeParams[] = {
-        getIdentTyPointerTy(), CGM.Int32Ty,
-        llvm::PointerType::getUnqual(getKmpCriticalNameTy())};
-    auto *FnTy =
-        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
-    RTLFn = CGM.CreateRuntimeFunction(
-        FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple");
+        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
     break;
   }
   case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
@@ -1806,7 +1778,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
     auto *FnTy =
         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
-    cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
+    cast<llvm::Function>(RTLFn.getCallee())
+        ->addFnAttr(llvm::Attribute::Convergent);
     break;
   }
   case OMPRTL__kmpc_barrier_simple_spmd: {
@@ -1817,7 +1790,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                       /*Name*/ "__kmpc_barrier_simple_spmd");
-    cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
+    cast<llvm::Function>(RTLFn.getCallee())
+        ->addFnAttr(llvm::Attribute::Convergent);
     break;
   }
   }
@@ -1928,7 +1902,7 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                               const Expr *ThreadLimit,
                                               SourceLocation Loc) {}

-llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
+llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   // Emit target region as a standalone region.
@@ -1976,11 +1950,11 @@ getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
          "expected teams directive.");
   const OMPExecutableDirective *Dir = &D;
   if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
-    if (const Stmt *S = getSingleCompoundChild(
+    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
             Ctx,
             D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                 /*IgnoreCaptured=*/true))) {
-      Dir = dyn_cast<OMPExecutableDirective>(S);
+      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
       if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
         Dir = nullptr;
     }
@@ -2005,7 +1979,7 @@ getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
   }
 }

-llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
+llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   SourceLocation Loc = D.getBeginLoc();
@@ -2014,13 +1988,14 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
   llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
   // Globalize team reductions variable unconditionally in all modes.
-  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
+  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
+    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
     getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
     if (!LastPrivatesReductions.empty()) {
       GlobalizedRD = ::buildRecordForGlobalizedVars(
           CGM.getContext(), llvm::None, LastPrivatesReductions,
-          MappedDeclsFields);
+          MappedDeclsFields, WarpSize);
     }
   } else if (!LastPrivatesReductions.empty()) {
     assert(!TeamAndReductions.first &&
@@ -2068,9 +2043,8 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
   } Action(Loc, GlobalizedRD, MappedDeclsFields);
   CodeGen.setAction(Action);
-  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
+  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
       D, ThreadIDVar, InnermostKind, CodeGen);
-  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
   OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
   OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
   OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
@@ -2235,8 +2209,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
             .getPointerType(CGM.getContext().VoidPtrTy)
             .castAs<PointerType>());
     llvm::Value *GlobalRecValue =
-        Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One())
-            .getPointer();
+        Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
     I->getSecond().GlobalRecordAddr = GlobalRecValue;
     I->getSecond().IsInSPMDModeFlag = nullptr;
     GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
@@ -2429,7 +2402,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                          const OMPExecutableDirective &D,
                                          SourceLocation Loc,
-                                         llvm::Value *OutlinedFn,
+                                         llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars) {
   if (!CGF.HaveInsertPoint())
     return;
@@ -2446,7 +2419,7 @@ void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
 }

 void CGOpenMPRuntimeNVPTX::emitParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   if (!CGF.HaveInsertPoint())
     return;
@@ -2536,8 +2509,7 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
       SharedArgs,
       Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
           .castAs<PointerType>());
   for (llvm::Value *V : CapturedVars) {
-    Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
-                                             CGF.getPointerSize());
+    Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
     llvm::Value *PtrV;
     if (V->getType()->isIntegerTy())
       PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
@@ -2625,7 +2597,7 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
 }

 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   // Just call the outlined function to execute the parallel region.
   // OutlinedFn(&GTid, &zero, CapturedStruct);
@@ -2846,7 +2818,7 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
   Address ElemPtr = DestAddr;
   Address Ptr = SrcAddr;
   Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
+      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
   for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
     if (Size < CharUnits::fromQuantity(IntSize))
       continue;
@@ -2881,10 +2853,8 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
           CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
           IntType, Offset, Loc);
       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
-      Address LocalPtr =
-          Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
-      Address LocalElemPtr =
-          Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
+      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
+      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
       PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
       PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
       CGF.EmitBranch(PreCondBB);
@@ -2894,9 +2864,8 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
           CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
           IntType, Offset, Loc);
       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
-      Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
-      ElemPtr =
-          Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
+      Ptr = Bld.CreateConstGEP(Ptr, 1);
+      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
     }
     Size = Size % IntSize;
   }
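Editor's note: shuffleAndStore moves an arbitrarily sized reduction element
across warp lanes by decomposing it into 8/4/2/1-byte chunks. A sketch of the
logic the emitted IR performs, in plain C++; shuffle64/shuffle32 are
stand-ins for the __kmpc_shuffle_int64/__kmpc_shuffle_int32 entry points
declared elsewhere in this file.

    #include <cstdint>
    #include <cstring>
    int64_t shuffle64(int64_t v, int16_t lane_offset); // assumed wrappers
    int32_t shuffle32(int32_t v, int16_t lane_offset);

    void shuffle_and_store(const char *src, char *dst, size_t size,
                           int16_t lane_offset) {
      for (int int_size = 8; int_size >= 1; int_size /= 2) {
        while (size >= static_cast<size_t>(int_size)) { // covers 7-/3-byte tails
          if (int_size > 4) {
            int64_t v;
            std::memcpy(&v, src, 8);
            v = shuffle64(v, lane_offset); // cross-lane move
            std::memcpy(dst, &v, 8);
          } else {
            int32_t v = 0;
            std::memcpy(&v, src, int_size); // sub-word chunks widen to i32
            v = shuffle32(v, lane_offset);
            std::memcpy(dst, &v, int_size);
          }
          src += int_size;
          dst += int_size;
          size -= int_size; // equivalent to the Size = Size % IntSize step
        }
      }
    }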
@@ -2959,16 +2928,14 @@ static void emitReductionListCopy(
   switch (Action) {
   case RemoteLaneToThread: {
     // Step 1.1: Get the address for the src element in the Reduce list.
-    Address SrcElementPtrAddr =
-        Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+    Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
     SrcElementAddr = CGF.EmitLoadOfPointer(
         SrcElementPtrAddr,
         C.getPointerType(Private->getType())->castAs<PointerType>());

     // Step 1.2: Create a temporary to store the element in the destination
     // Reduce list.
-    DestElementPtrAddr =
-        Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+    DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
     DestElementAddr =
         CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
     ShuffleInElement = true;
@@ -2977,16 +2944,14 @@ static void emitReductionListCopy(
   }
   case ThreadCopy: {
     // Step 1.1: Get the address for the src element in the Reduce list.
-    Address SrcElementPtrAddr =
-        Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+    Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
     SrcElementAddr = CGF.EmitLoadOfPointer(
         SrcElementPtrAddr,
         C.getPointerType(Private->getType())->castAs<PointerType>());

     // Step 1.2: Get the address for dest element. The destination
     // element has already been created on the thread's stack.
-    DestElementPtrAddr =
-        Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+    DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
     DestElementAddr = CGF.EmitLoadOfPointer(
         DestElementPtrAddr,
         C.getPointerType(Private->getType())->castAs<PointerType>());
     break;
   }
   case ThreadToScratchpad: {
     // Step 1.1: Get the address for the src element in the Reduce list.
-    Address SrcElementPtrAddr =
-        Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+    Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
     SrcElementAddr = CGF.EmitLoadOfPointer(
         SrcElementPtrAddr,
         C.getPointerType(Private->getType())->castAs<PointerType>());
@@ -3030,8 +2994,7 @@ static void emitReductionListCopy(

     // Step 1.2: Create a temporary to store the element in the destination
     // Reduce list.
-    DestElementPtrAddr =
-        Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+    DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
     DestElementAddr =
         CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
     UpdateDestListPtr = true;
@@ -3052,18 +3015,31 @@ static void emitReductionListCopy(
     shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                     RemoteLaneOffset, Private->getExprLoc());
   } else {
-    if (Private->getType()->isScalarType()) {
+    switch (CGF.getEvaluationKind(Private->getType())) {
+    case TEK_Scalar: {
       llvm::Value *Elem =
           CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                                Private->getType(), Private->getExprLoc());
       // Store the source element value to the dest element address.
       CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                             Private->getType());
-    } else {
+      break;
+    }
+    case TEK_Complex: {
+      CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
+          CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
+          Private->getExprLoc());
+      CGF.EmitStoreOfComplex(
+          Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
+          /*isInit=*/false);
+      break;
+    }
+    case TEK_Aggregate:
       CGF.EmitAggregateCopy(
           CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
           CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
           Private->getType(), AggValueSlot::DoesNotOverlap);
+      break;
     }
   }

@@ -3147,9 +3123,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   const CGFunctionInfo &CGFI =
       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
+  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
+                                    llvm::GlobalValue::InternalLinkage,
+                                    "_omp_reduction_inter_warp_copy_func", &M);
   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
   Fn->setDoesNotRecurse();
   CodeGenFunction CGF(CGM);
@@ -3246,8 +3222,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
       CGF.EmitBlock(ThenBB);
       // Reduce element = LocalReduceList[i]
-      Address ElemPtrPtrAddr =
-          Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
       llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
           ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
       // elemptr = ((CopyType*)(elemptrptr)) + I
@@ -3313,8 +3288,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
       SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
       // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
-      Address TargetElemPtrPtr =
-          Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
       llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
           TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
       Address TargetElemPtr = Address(TargetElemPtrVal, Align);
@@ -3418,9 +3392,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
 /// (2k+1)th thread is ignored in the value aggregation. Therefore
 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
 /// that the contiguity assumption still holds.
-static llvm::Value *emitShuffleAndReduceFunction(
+static llvm::Function *emitShuffleAndReduceFunction(
     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
-    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
+    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
   ASTContext &C = CGM.getContext();

   // Thread local Reduce list used to host the values of data to be reduced.
@@ -3568,6 +3542,406 @@ static llvm::Value *emitShuffleAndReduceFunction(
   return Fn;
 }

+/// This function emits a helper that copies all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+///   For all data entries D in reduce_data:
+///     Copy local D to buffer.D[Idx]
+static llvm::Value *emitListToGlobalCopyFunction(
+    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
+    QualType ReductionArrayTy, SourceLocation Loc,
+    const RecordDecl *TeamReductionRec,
+    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
+        &VarFieldMap) {
+  ASTContext &C = CGM.getContext();
+
+  // Buffer: global reduction buffer.
+  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                              C.VoidPtrTy, ImplicitParamDecl::Other);
+  // Idx: index of the buffer.
+  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
+                           ImplicitParamDecl::Other);
+  // ReduceList: thread local Reduce list.
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
+  FunctionArgList Args;
+  Args.push_back(&BufferArg);
+  Args.push_back(&IdxArg);
+  Args.push_back(&ReduceListArg);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
+  Fn->setDoesNotRecurse();
+  CodeGenFunction CGF(CGM);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
+  Address LocalReduceList(
+      Bld.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
+                               C.VoidPtrTy, Loc),
+          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
+      CGF.getPointerAlign());
+  QualType StaticTy = C.getRecordType(TeamReductionRec);
+  llvm::Type *LLVMReductionsBufferTy =
+      CGM.getTypes().ConvertTypeForMem(StaticTy);
+  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
+      LLVMReductionsBufferTy->getPointerTo());
+  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
+                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
+                                              /*Volatile=*/false, C.IntTy,
+                                              Loc)};
+  unsigned Idx = 0;
+  for (const Expr *Private : Privates) {
+    // Reduce element = LocalReduceList[i]
+    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
+    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
+        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+    // elemptr = ((CopyType*)(elemptrptr)) + I
+    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
+    Address ElemPtr =
+        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
+    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
+    // Global = Buffer.VD[Idx];
+    const FieldDecl *FD = VarFieldMap.lookup(VD);
+    LValue GlobLVal = CGF.EmitLValueForField(
+        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
+    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
+    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
+    switch (CGF.getEvaluationKind(Private->getType())) {
+    case TEK_Scalar: {
+      llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
+                                            Private->getType(), Loc);
+      CGF.EmitStoreOfScalar(V, GlobLVal);
+      break;
+    }
+    case TEK_Complex: {
+      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
+          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
+      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
+      break;
+    }
+    case TEK_Aggregate:
+      CGF.EmitAggregateCopy(GlobLVal,
+                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
+                            Private->getType(), AggValueSlot::DoesNotOverlap);
+      break;
+    }
+    ++Idx;
+  }
+
+  CGF.FinishFunction();
+  return Fn;
+}
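Editor's note: to make the generated helper concrete, for a single double
reduction variable the emitted _omp_reduction_list_to_global_copy_func behaves
like the C++ below. A sketch only; the struct/field names are illustrative,
and 1024 is the default OpenMPCUDAReductionBufNum assumed here.

    constexpr int NumRecords = 1024; // hypothetical num_of_records
    struct team_buffer_ty { double sum[NumRecords]; };

    void list_to_global_copy(void *buffer, int idx, void *reduce_data) {
      auto *buf = static_cast<team_buffer_ty *>(buffer);
      auto **red_list = static_cast<void **>(reduce_data); // RedList of ptrs
      buf->sum[idx] = *static_cast<double *>(red_list[0]); // local D -> buffer.D[idx]
    }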
+
+/// This function emits a helper that reduces all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+///  void *GlobPtrs[];
+///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+///  ...
+///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+///  reduce_function(GlobPtrs, reduce_data);
+static llvm::Value *emitListToGlobalReduceFunction(
+    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
+    QualType ReductionArrayTy, SourceLocation Loc,
+    const RecordDecl *TeamReductionRec,
+    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
+        &VarFieldMap,
+    llvm::Function *ReduceFn) {
+  ASTContext &C = CGM.getContext();
+
+  // Buffer: global reduction buffer.
+  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                              C.VoidPtrTy, ImplicitParamDecl::Other);
+  // Idx: index of the buffer.
+  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
+                           ImplicitParamDecl::Other);
+  // ReduceList: thread local Reduce list.
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
+  FunctionArgList Args;
+  Args.push_back(&BufferArg);
+  Args.push_back(&IdxArg);
+  Args.push_back(&ReduceListArg);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
+  Fn->setDoesNotRecurse();
+  CodeGenFunction CGF(CGM);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
+  QualType StaticTy = C.getRecordType(TeamReductionRec);
+  llvm::Type *LLVMReductionsBufferTy =
+      CGM.getTypes().ConvertTypeForMem(StaticTy);
+  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
+      LLVMReductionsBufferTy->getPointerTo());
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  Address ReductionList =
+      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
+  auto IPriv = Privates.begin();
+  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
+                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
+                                              /*Volatile=*/false, C.IntTy,
+                                              Loc)};
+  unsigned Idx = 0;
+  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
+    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
+    // Global = Buffer.VD[Idx];
+    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
+    const FieldDecl *FD = VarFieldMap.lookup(VD);
+    LValue GlobLVal = CGF.EmitLValueForField(
+        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
+    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
+    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
+    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
+    if ((*IPriv)->getType()->isVariablyModifiedType()) {
+      // Store array size.
+      ++Idx;
+      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
+      llvm::Value *Size = CGF.Builder.CreateIntCast(
+          CGF.getVLASize(
+                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
+              .NumElts,
+          CGF.SizeTy, /*isSigned=*/false);
+      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
+                              Elem);
+    }
+  }
+
+  // Call reduce_function(GlobalReduceList, ReduceList)
+  llvm::Value *GlobalReduceList =
+      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
+      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
+  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
+      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
+  CGF.FinishFunction();
+  return Fn;
+}
+
+/// This function emits a helper that copies all the reduction variables from
+/// the provided global buffer into the team's reduction variables.
+///
+/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
+///   For all data entries D in reduce_data:
+///     Copy buffer.D[Idx] to local D;
+static llvm::Value *emitGlobalToListCopyFunction(
+    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
+    QualType ReductionArrayTy, SourceLocation Loc,
+    const RecordDecl *TeamReductionRec,
+    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
+        &VarFieldMap) {
+  ASTContext &C = CGM.getContext();
+
+  // Buffer: global reduction buffer.
+  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                              C.VoidPtrTy, ImplicitParamDecl::Other);
+  // Idx: index of the buffer.
+  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
+                           ImplicitParamDecl::Other);
+  // ReduceList: thread local Reduce list.
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
+  FunctionArgList Args;
+  Args.push_back(&BufferArg);
+  Args.push_back(&IdxArg);
+  Args.push_back(&ReduceListArg);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
+  Fn->setDoesNotRecurse();
+  CodeGenFunction CGF(CGM);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
+  Address LocalReduceList(
+      Bld.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
+                               C.VoidPtrTy, Loc),
+          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
+      CGF.getPointerAlign());
+  QualType StaticTy = C.getRecordType(TeamReductionRec);
+  llvm::Type *LLVMReductionsBufferTy =
+      CGM.getTypes().ConvertTypeForMem(StaticTy);
+  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
+      LLVMReductionsBufferTy->getPointerTo());
+
+  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
+                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
+                                              /*Volatile=*/false, C.IntTy,
+                                              Loc)};
+  unsigned Idx = 0;
+  for (const Expr *Private : Privates) {
+    // Reduce element = LocalReduceList[i]
+    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
+    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
+        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+    // elemptr = ((CopyType*)(elemptrptr)) + I
+    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
+    Address ElemPtr =
+        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
+    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
+    // Global = Buffer.VD[Idx];
+    const FieldDecl *FD = VarFieldMap.lookup(VD);
+    LValue GlobLVal = CGF.EmitLValueForField(
+        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
+    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
+    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
+    switch (CGF.getEvaluationKind(Private->getType())) {
+    case TEK_Scalar: {
+      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
+      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
+      break;
+    }
+    case TEK_Complex: {
+      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
+      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
+                             /*isInit=*/false);
+      break;
+    }
+    case TEK_Aggregate:
+      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
+                            GlobLVal, Private->getType(),
+                            AggValueSlot::DoesNotOverlap);
+      break;
+    }
+    ++Idx;
+  }
+
+  CGF.FinishFunction();
+  return Fn;
+}
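Editor's note: the two reduce helpers differ only in the argument order they
pass to the outlined reduce function, where the left operand absorbs the
right. A sketch in C++, reusing the hypothetical team_buffer_ty from the
earlier sketch; reduce_function is the generated combiner.

    extern void reduce_function(void *lhs, void *rhs); // lhs op= rhs, per var

    void list_to_global_reduce(void *buffer, int idx, void *reduce_data) {
      auto *buf = static_cast<team_buffer_ty *>(buffer);
      void *glob_ptrs[1] = {&buf->sum[idx]};
      reduce_function(glob_ptrs, reduce_data); // buffer.D[idx] op= local D
    }
    void global_to_list_reduce(void *buffer, int idx, void *reduce_data) {
      auto *buf = static_cast<team_buffer_ty *>(buffer);
      void *glob_ptrs[1] = {&buf->sum[idx]};
      reduce_function(reduce_data, glob_ptrs); // local D op= buffer.D[idx]
    }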
+
+/// This function emits a helper that reduces all the reduction variables from
+/// the provided global buffer into the team's reduction variables.
+///
+/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
+///  void *GlobPtrs[];
+///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+///  ...
+///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+///  reduce_function(reduce_data, GlobPtrs);
+static llvm::Value *emitGlobalToListReduceFunction(
+    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
+    QualType ReductionArrayTy, SourceLocation Loc,
+    const RecordDecl *TeamReductionRec,
+    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
+        &VarFieldMap,
+    llvm::Function *ReduceFn) {
+  ASTContext &C = CGM.getContext();
+
+  // Buffer: global reduction buffer.
+  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                              C.VoidPtrTy, ImplicitParamDecl::Other);
+  // Idx: index of the buffer.
+  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
+                           ImplicitParamDecl::Other);
+  // ReduceList: thread local Reduce list.
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                  C.VoidPtrTy, ImplicitParamDecl::Other);
+  FunctionArgList Args;
+  Args.push_back(&BufferArg);
+  Args.push_back(&IdxArg);
+  Args.push_back(&ReduceListArg);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
+  Fn->setDoesNotRecurse();
+  CodeGenFunction CGF(CGM);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
+  QualType StaticTy = C.getRecordType(TeamReductionRec);
+  llvm::Type *LLVMReductionsBufferTy =
+      CGM.getTypes().ConvertTypeForMem(StaticTy);
+  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
+      LLVMReductionsBufferTy->getPointerTo());
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  Address ReductionList =
+      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
+  auto IPriv = Privates.begin();
+  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
+                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
+                                              /*Volatile=*/false, C.IntTy,
+                                              Loc)};
+  unsigned Idx = 0;
+  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
+    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
+    // Global = Buffer.VD[Idx];
+    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
+    const FieldDecl *FD = VarFieldMap.lookup(VD);
+    LValue GlobLVal = CGF.EmitLValueForField(
+        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
+    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
+    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
+    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
+    if ((*IPriv)->getType()->isVariablyModifiedType()) {
+      // Store array size.
+      ++Idx;
+      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
+      llvm::Value *Size = CGF.Builder.CreateIntCast(
+          CGF.getVLASize(
+                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
+              .NumElts,
+          CGF.SizeTy, /*isSigned=*/false);
+      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
+                              Elem);
+    }
+  }
+
+  // Call reduce_function(ReduceList, GlobalReduceList)
+  llvm::Value *GlobalReduceList =
+      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
+      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
+  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
+      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
+  CGF.FinishFunction();
+  return Fn;
+}
+
 ///
 /// Design of OpenMP reductions on the GPU
 ///
@@ -3841,57 +4215,55 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
   llvm::Value *ThreadId = getThreadID(CGF, Loc);
   llvm::Value *Res;
-  if (ParallelReduction) {
-    ASTContext &C = CGM.getContext();
-    // 1. Build a list of reduction variables.
-    // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
-    auto Size = RHSExprs.size();
-    for (const Expr *E : Privates) {
-      if (E->getType()->isVariablyModifiedType())
-        // Reserve place for array size.
-        ++Size;
-    }
-    llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
-    QualType ReductionArrayTy =
-        C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
-                               /*IndexTypeQuals=*/0);
-    Address ReductionList =
-        CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
-    auto IPriv = Privates.begin();
-    unsigned Idx = 0;
-    for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
-      Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
-                                                     CGF.getPointerSize());
-      CGF.Builder.CreateStore(
-          CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-              CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
-          Elem);
-      if ((*IPriv)->getType()->isVariablyModifiedType()) {
-        // Store array size.
-        ++Idx;
-        Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
-                                               CGF.getPointerSize());
-        llvm::Value *Size = CGF.Builder.CreateIntCast(
-            CGF.getVLASize(
-                CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
-                .NumElts,
-            CGF.SizeTy, /*isSigned=*/false);
-        CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
-                                Elem);
-      }
+  ASTContext &C = CGM.getContext();
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  auto Size = RHSExprs.size();
+  for (const Expr *E : Privates) {
+    if (E->getType()->isVariablyModifiedType())
+      // Reserve place for array size.
+      ++Size;
+  }
+  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
+  QualType ReductionArrayTy =
+      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
+                             /*IndexTypeQuals=*/0);
+  Address ReductionList =
+      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
+  auto IPriv = Privates.begin();
+  unsigned Idx = 0;
+  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
+    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
+    CGF.Builder.CreateStore(
+        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
+        Elem);
+    if ((*IPriv)->getType()->isVariablyModifiedType()) {
+      // Store array size.
+      ++Idx;
+      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
+      llvm::Value *Size = CGF.Builder.CreateIntCast(
+          CGF.getVLASize(
+              CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
+              .NumElts,
+          CGF.SizeTy, /*isSigned=*/false);
+      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
+                              Elem);
+    }
+  }

-    llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
-    llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-        ReductionList.getPointer(), CGF.VoidPtrTy);
-    llvm::Value *ReductionFn = emitReductionFunction(
-        CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
-        Privates, LHSExprs, RHSExprs, ReductionOps);
-    llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
-        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
-    llvm::Value *InterWarpCopyFn =
-        emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
+  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReductionList.getPointer(), CGF.VoidPtrTy);
+  llvm::Function *ReductionFn = emitReductionFunction(
+      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
+      LHSExprs, RHSExprs, ReductionOps);
+  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
+  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
+      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
+  llvm::Value *InterWarpCopyFn =
+      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

+  if (ParallelReduction) {
     llvm::Value *Args[] = {RTLoc,
                            ThreadId,
                            CGF.Builder.getInt32(RHSExprs.size()),
@@ -3900,17 +4272,59 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
                            ShuffleAndReduceFn,
                            InterWarpCopyFn};

-    Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
-                                  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2),
-                              Args);
+    Res = CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(
+            OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
+        Args);
   } else {
     assert(TeamsReduction && "expected teams reduction.");
-    std::string Name = getName({"reduction"});
-    llvm::Value *Lock = getCriticalRegionLock(Name);
-    llvm::Value *Args[] = {RTLoc, ThreadId, Lock};
+    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
+    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
+    int Cnt = 0;
+    for (const Expr *DRE : Privates) {
+      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
+      ++Cnt;
+    }
+    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
+        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
+        C.getLangOpts().OpenMPCUDAReductionBufNum);
+    TeamsReductions.push_back(TeamReductionRec);
+    if (!KernelTeamsReductionPtr) {
+      KernelTeamsReductionPtr = new llvm::GlobalVariable(
+          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
+          llvm::GlobalValue::InternalLinkage, nullptr,
+          "_openmp_teams_reductions_buffer_$_$ptr");
+    }
+    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
+        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
+        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
+    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
+        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
+    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
+        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
+        ReductionFn);
+    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
+        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
@@ -3941,30 +4355,14 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
       ++IRHS;
     }
   };
-  if (ParallelReduction) {
-    llvm::Value *EndArgs[] = {ThreadId};
-    RegionCodeGenTy RCG(CodeGen);
-    NVPTXActionTy Action(
-        nullptr, llvm::None,
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
-        EndArgs);
-    RCG.setAction(Action);
-    RCG(CGF);
-  } else {
-    assert(TeamsReduction && "expected teams reduction.");
-    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-    std::string Name = getName({"reduction"});
-    llvm::Value *Lock = getCriticalRegionLock(Name);
-    llvm::Value *EndArgs[] = {RTLoc, ThreadId, Lock};
-    RegionCodeGenTy RCG(CodeGen);
-    NVPTXActionTy Action(
-        nullptr, llvm::None,
-        createNVPTXRuntimeFunction(
-            OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple),
-        EndArgs);
-    RCG.setAction(Action);
-    RCG(CGF);
-  }
+  llvm::Value *EndArgs[] = {ThreadId};
+  RegionCodeGenTy RCG(CodeGen);
+  NVPTXActionTy Action(
+      nullptr, llvm::None,
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
+      EndArgs);
+  RCG.setAction(Action);
+  RCG(CGF);
   // There is no need to emit line number for unconditional branch.
   (void)ApplyDebugLocation::CreateEmpty(CGF);
   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
@@ -3983,6 +4381,10 @@ CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
     if (Attr->getCaptureKind() == OMPC_map) {
       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                         LangAS::opencl_global);
+    } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
+               PointeeTy.isConstant(CGM.getContext())) {
+      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
+                                                        LangAS::opencl_generic);
     }
   }
   ArgType = CGM.getContext().getPointerType(PointeeTy);
@@ -4034,12 +4436,11 @@ CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
 }
 
 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
     ArrayRef<llvm::Value *> Args) const {
   SmallVector<llvm::Value *, 4> TargetArgs;
   TargetArgs.reserve(Args.size());
-  auto *FnType =
-      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
+  auto *FnType = OutlinedFn.getFunctionType();
   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
       TargetArgs.append(std::next(Args.begin(), I), Args.end());
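The emitOutlinedFunctionCall change is part of the llvm::FunctionCallee migration: the callee now carries its FunctionType directly, so the old getType()->getPointerElementType() cast is unnecessary. A self-contained sketch of the new pattern (trivial module; the function name is hypothetical):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"

    int main() {
      llvm::LLVMContext Ctx;
      llvm::Module M("demo", Ctx);
      // getOrInsertFunction returns a FunctionCallee: {function type, callee}.
      llvm::FunctionCallee Callee = M.getOrInsertFunction(
          "outlined_fn", llvm::Type::getVoidTy(Ctx), llvm::Type::getInt32Ty(Ctx));
      // New style: ask the callee for its type instead of peeking at the
      // pointee type of the callee's pointer.
      llvm::FunctionType *FnTy = Callee.getFunctionType();
      return FnTy->getNumParams() == 1 ? 0 : 1;
    }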
@@ -4137,8 +4538,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
   }
   unsigned Idx = 0;
   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
-    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
-                                             CGF.getPointerSize());
+    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
     Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
         Src, CGF.SizeTy->getPointerTo());
     llvm::Value *LB = CGF.EmitLoadOfScalar(
@@ -4148,8 +4548,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
     Args.emplace_back(LB);
     ++Idx;
-    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
-                                     CGF.getPointerSize());
+    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
     TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
         Src, CGF.SizeTy->getPointerTo());
     llvm::Value *UB = CGF.EmitLoadOfScalar(
@@ -4164,8 +4563,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
   ASTContext &CGFContext = CGF.getContext();
   for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
     QualType ElemTy = CurField->getType();
-    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
-                                             CGF.getPointerSize());
+    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
     Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
         Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
     llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
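The wrapper emitted above behaves like the following hand-written stub: with loop-bound sharing, slots 0 and 1 of the shared argument list carry the bounds, and each later slot holds a pointer for one capture. A schematic sketch only; all names are hypothetical:

    #include <cstddef>
    #include <cstdint>

    // Stand-in for the outlined parallel region.
    static void OutlinedFn(int16_t /*Level*/, int32_t /*Tid*/, std::size_t LB,
                           std::size_t UB, int *Captured) {
      for (std::size_t I = LB; I < UB; ++I)
        Captured[I] += 1;
    }

    // Mirrors the GEP/bitcast/load sequence above for one pointer capture.
    static void Wrapper(int16_t Level, int32_t Tid, void **SharedArgs) {
      std::size_t LB = *reinterpret_cast<std::size_t *>(SharedArgs[0]);
      std::size_t UB = *reinterpret_cast<std::size_t *>(SharedArgs[1]);
      int *Captured = *reinterpret_cast<int **>(SharedArgs[2]);
      OutlinedFn(Level, Tid, LB, UB, Captured);
    }

    int main() {
      int Data[8] = {};
      std::size_t LB = 0, UB = 8;
      int *DataPtr = Data;
      void *SharedArgs[3] = {&LB, &UB, &DataPtr};
      Wrapper(/*Level=*/0, /*Tid=*/0, SharedArgs);
      return Data[0] == 1 ? 0 : 1;
    }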
@@ -4266,6 +4664,58 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
 
 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                         const VarDecl *VD) {
+  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
+    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
+    switch (A->getAllocatorType()) {
+      // Use the default allocator here as by default local vars are
+      // threadlocal.
+    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
+    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
+    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
+      // Follow the user decision - use default allocation.
+      return Address::invalid();
+    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
+      // TODO: implement support for user-defined allocators.
+      return Address::invalid();
+    case OMPAllocateDeclAttr::OMPConstMemAlloc: {
+      llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
+      auto *GV = new llvm::GlobalVariable(
+          CGM.getModule(), VarTy, /*isConstant=*/false,
+          llvm::GlobalValue::InternalLinkage,
+          llvm::Constant::getNullValue(VarTy), VD->getName(),
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
+      CharUnits Align = CGM.getContext().getDeclAlign(VD);
+      GV->setAlignment(Align.getQuantity());
+      return Address(GV, Align);
+    }
+    case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
+      llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
+      auto *GV = new llvm::GlobalVariable(
+          CGM.getModule(), VarTy, /*isConstant=*/false,
+          llvm::GlobalValue::InternalLinkage,
+          llvm::Constant::getNullValue(VarTy), VD->getName(),
+          /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+          CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+      CharUnits Align = CGM.getContext().getDeclAlign(VD);
+      GV->setAlignment(Align.getQuantity());
+      return Address(GV, Align);
+    }
+    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
+    case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
+      llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
+      auto *GV = new llvm::GlobalVariable(
+          CGM.getModule(), VarTy, /*isConstant=*/false,
+          llvm::GlobalValue::InternalLinkage,
+          llvm::Constant::getNullValue(VarTy), VD->getName());
+      CharUnits Align = CGM.getContext().getDeclAlign(VD);
+      GV->setAlignment(Align.getQuantity());
+      return Address(GV, Align);
+    }
+    }
+  }
+
   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
     return Address::invalid();
 
@@ -4287,6 +4737,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
       return VDI->second.PrivateAddr;
     }
   }
+
   return Address::invalid();
 }
 
@@ -4374,6 +4825,38 @@ void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
   }
 }
 
+unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
+  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
+}
+
+bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
+                                                            LangAS &AS) {
+  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
+    return false;
+  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
+  switch (A->getAllocatorType()) {
+  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
+    // Not supported; fall back to the default mem space.
+  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
+  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
+  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
+  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
+  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
+    AS = LangAS::Default;
+    return true;
+  case OMPAllocateDeclAttr::OMPConstMemAlloc:
+    AS = LangAS::cuda_constant;
+    return true;
+  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
+    AS = LangAS::cuda_shared;
+    return true;
+  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
+    llvm_unreachable("Expected predefined allocator for the variables with the "
+                     "static storage.");
+  }
+  return false;
+}
+
 // Get current CudaArch and ignore any unknown values
 static CudaArch getCudaArch(CodeGenModule &CGM) {
   if (!CGM.getTarget().hasFeature("ptx"))
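Together, these two routines implement the OpenMP 5.0 allocate directive for NVPTX by mapping the predefined allocators onto CUDA memory spaces: omp_const_mem_alloc to __constant__, omp_pteam_mem_alloc to __shared__, and the remaining ones to ordinary global/generic memory. An illustrative use (host-compilable; the placement only takes effect in device code):

    #include <omp.h>

    // Mapped to CUDA __constant__ memory by the OMPConstMemAlloc cases above
    // when emitted for the device.
    const int Table[4] = {1, 2, 3, 4};
    #pragma omp allocate(Table) allocator(omp_const_mem_alloc)

    int lookup(int I) {
      // A team-local scratch variable: the OMPPTeamMemAlloc case lands it
      // in CUDA __shared__ memory in device code.
      int TeamScratch;
    #pragma omp allocate(TeamScratch) allocator(omp_pteam_mem_alloc)
      TeamScratch = Table[I & 3];
      return TeamScratch;
    }

    int main() { return lookup(0) == 1 ? 0 : 1; }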
@@ -4395,7 +4878,7 @@ static CudaArch getCudaArch(CodeGenModule &CGM) {
 /// Check to see if target architecture supports unified addressing which is
 /// a restriction for OpenMP requires clause "unified_shared_memory".
 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
-    CodeGenModule &CGM, const OMPRequiresDecl *D) const {
+    const OMPRequiresDecl *D) const {
   for (const OMPClause *Clause : D->clauselists()) {
     if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
       switch (getCudaArch(CGM)) {
@@ -4587,9 +5070,12 @@ void CGOpenMPRuntimeNVPTX::clear() {
       QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
                                                /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
+      // FIXME: nvlink does not handle weak linkage correctly (objects with
+      // different sizes are reported as erroneous).
+      // Restore CommonLinkage as soon as nvlink is fixed.
       auto *GV = new llvm::GlobalVariable(
           CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
           llvm::Constant::getNullValue(LLVMArr2Ty),
           "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
@@ -4600,5 +5086,36 @@ void CGOpenMPRuntimeNVPTX::clear() {
       }
     }
   }
+  if (!TeamsReductions.empty()) {
+    ASTContext &C = CGM.getContext();
+    RecordDecl *StaticRD = C.buildImplicitRecord(
+        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
+    StaticRD->startDefinition();
+    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
+      QualType RecTy = C.getRecordType(TeamReductionRec);
+      auto *Field = FieldDecl::Create(
+          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
+          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
+          /*BW=*/nullptr, /*Mutable=*/false,
+          /*InitStyle=*/ICIS_NoInit);
+      Field->setAccess(AS_public);
+      StaticRD->addDecl(Field);
+    }
+    StaticRD->completeDefinition();
+    QualType StaticTy = C.getRecordType(StaticRD);
+    llvm::Type *LLVMReductionsBufferTy =
+        CGM.getTypes().ConvertTypeForMem(StaticTy);
+    // FIXME: nvlink does not handle weak linkage correctly (objects with
+    // different sizes are reported as erroneous).
+    // Restore CommonLinkage as soon as nvlink is fixed.
+    auto *GV = new llvm::GlobalVariable(
+        CGM.getModule(), LLVMReductionsBufferTy,
+        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
+        "_openmp_teams_reductions_buffer_$_");
+    KernelTeamsReductionPtr->setInitializer(
+        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
+                                                             CGM.VoidPtrTy));
+  }
   CGOpenMPRuntime::clear();
 }
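The buffer emitted here is, conceptually, a union of every kernel's globalized reduction record array, so a single allocation covers the largest per-kernel requirement, and the $ptr global lets emitReduction load the buffer's address before the buffer type is complete. A host-side C++ sketch of the idea (the record types and the slot count are illustrative stand-ins):

    #include <cstdio>

    // Stand-ins for two kernels' globalized reduction records.
    struct KernelARec { double Sum; };
    struct KernelBRec { int Min; int Max; };

    // One slot per buffered team result (stand-in for the value of
    // -fopenmp-cuda-teams-reduction-recs-num / OpenMPCUDAReductionBufNum).
    constexpr int BufNum = 2048;

    // Mirrors _openmp_teams_reduction_type_$_: a union of all records, so
    // its size is the maximum any kernel needs.
    union TeamsReductionTy {
      KernelARec A[BufNum];
      KernelBRec B[BufNum];
    };

    static TeamsReductionTy Buffer;    // _openmp_teams_reductions_buffer_$_
    static void *BufferPtr = &Buffer;  // ..._$_$ptr, loaded by emitReduction

    int main() {
      std::printf("shared reduction buffer: %zu bytes at %p\n",
                  sizeof(Buffer), BufferPtr);
      return 0;
    }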