path: root/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
author     Jordan Rupprecht <rupprecht@google.com>  2019-01-18 19:46:00 +0000
committer  Jordan Rupprecht <rupprecht@google.com>  2019-01-18 19:46:00 +0000
commit     3748d41833787fcbf59cc5624e8d2b042a8991bc (patch)
tree       f3fcdba7decca7ee845a1bb3f885cb0baa1b4d83 /lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
parent     55c8788102d8fd203270fabd6513247b2d7fbd87 (diff)
parent     e054eb577a1f469b1a4a49fce08572c76e2dddf2 (diff)
Creating branches/google/stable and tags/google/stable/2019-01-18 from r351319
git-svn-id: https://llvm.org/svn/llvm-project/cfe/branches/google/stable@351578 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r--  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 146
1 file changed, 95 insertions, 51 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index b055132ef0..7046ab3aa3 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -56,12 +56,12 @@ enum OpenMPRTLFunctionNVPTX {
/// Call to int64_t __kmpc_shuffle_int64(int64_t element,
/// int16_t lane_offset, int16_t warp_size);
OMPRTL_NVPTX__kmpc_shuffle_int64,
- /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
+ /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
/// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
/// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
/// lane_offset, int16_t shortCircuit),
/// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
- OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
+ OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2,
/// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
/// global_tid, kmp_critical_name *lck)
OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple,
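For reference, the C-level prototype implied by the renamed entry point's comment above (and by the llvm::FunctionType built later in this patch) is roughly the following sketch; ident_t is treated as opaque here, the callback typedef names are taken from the comment, and the authoritative declaration lives in the NVPTX device runtime, not in this file:

  #include <cstddef>
  #include <cstdint>

  struct ident_t; // opaque source-location descriptor owned by the runtime

  typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
                                          int16_t lane_offset,
                                          int16_t shortCircuit);
  typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);

  extern "C" int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
      ident_t *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
      void *reduce_data, kmp_ShuffleReductFctPtr shuffle_reduce_fn,
      kmp_InterWarpCopyFctPtr inter_warp_copy_fn);

The only signature change relative to the removed _nowait entry is the leading ident_t *loc, which threads the source-location descriptor through to the runtime.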
@@ -91,13 +91,17 @@ enum OpenMPRTLFunctionNVPTX {
OMPRTL_NVPTX__kmpc_parallel_level,
/// Call to int8_t __kmpc_is_spmd_exec_mode();
OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
- /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size,
- /// int16_t is_shared, const void **res);
+ /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+ /// const void *buf, size_t size, int16_t is_shared, const void **res);
OMPRTL_NVPTX__kmpc_get_team_static_memory,
- /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared);
+ /// Call to void __kmpc_restore_team_static_memory(int16_t
+ /// isSPMDExecutionMode, int16_t is_shared);
OMPRTL_NVPTX__kmpc_restore_team_static_memory,
- // Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
+ /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_barrier,
+ /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
+ /// global_tid);
+ OMPRTL__kmpc_barrier_simple_spmd,
};
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
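Since both static-memory entry points gain a leading isSPMDExecutionMode flag in this patch, the sketch below shows how a target region is expected to pair them, mirroring the prolog/epilog changes further down; the wrapper function and variable names are illustrative only:

  #include <cstddef>
  #include <cstdint>

  extern "C" void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                                const void *buf, size_t size,
                                                int16_t is_shared,
                                                const void **res);
  extern "C" void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                                    int16_t is_shared);

  // Illustrative pairing; IsSPMD is 1 in SPMD mode and 0 in generic mode, the
  // same constant the codegen below materializes from getExecutionMode().
  void use_team_static_memory(int16_t IsSPMD, const void *StaticBuf,
                              size_t StaticSize, int16_t UseSharedMem) {
    const void *Frame = nullptr;
    // Prolog: reserve the team's statically allocated globalized frame.
    __kmpc_get_team_static_memory(IsSPMD, StaticBuf, StaticSize, UseSharedMem,
                                  &Frame);
    // ... the target region body uses Frame for globalized variables ...
    // Epilog: release the frame, passing the same execution-mode flag.
    __kmpc_restore_team_static_memory(IsSPMD, UseSharedMem);
  }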
@@ -640,17 +644,6 @@ static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
"nvptx_num_threads");
}
-/// Get barrier to synchronize all threads in a block.
-static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
- llvm::Function *F = llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0);
- F->addFnAttr(llvm::Attribute::Convergent);
- CGF.EmitRuntimeCall(F);
-}
-
-/// Synchronize all GPU threads in a block.
-static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
-
/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
@@ -712,8 +705,8 @@ getDataSharingMode(CodeGenModule &CGM) {
: CGOpenMPRuntimeNVPTX::Generic;
}
-// Checks if the expression is constant or does not have non-trivial function
-// calls.
+/// Checks if the expression is constant or does not have non-trivial function
+/// calls.
static bool isTrivial(ASTContext &Ctx, const Expr * E) {
// We can skip constant expressions.
// We can skip expressions with trivial calls or simple expressions.
@@ -1654,12 +1647,12 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
break;
}
- case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
- // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
- // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
- // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
- // lane_offset, int16_t Algorithm Version),
- // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
+ case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: {
+ // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
+ // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
+ // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
+ // lane_id, int16_t lane_offset, int16_t Algorithm Version), void
+ // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
CGM.Int16Ty, CGM.Int16Ty};
auto *ShuffleReduceFnTy =
@@ -1669,7 +1662,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
auto *InterWarpCopyFnTy =
llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
/*isVarArg=*/false);
- llvm::Type *TypeParams[] = {CGM.Int32Ty,
+ llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
+ CGM.Int32Ty,
CGM.Int32Ty,
CGM.SizeTy,
CGM.VoidPtrTy,
@@ -1678,7 +1672,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
auto *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(
- FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
+ FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
break;
}
case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
@@ -1787,19 +1781,21 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
break;
}
case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
- // Build void __kmpc_get_team_static_memory(const void *buf, size_t size,
- // int16_t is_shared, const void **res);
- llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty,
- CGM.VoidPtrPtrTy};
+ // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+ // const void *buf, size_t size, int16_t is_shared, const void **res);
+ llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
+ CGM.Int16Ty, CGM.VoidPtrPtrTy};
auto *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
break;
}
case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
- // Build void __kmpc_restore_team_static_memory(int16_t is_shared);
+ // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+ // int16_t is_shared);
+ llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
auto *FnTy =
- llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false);
+ llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
break;
@@ -1813,6 +1809,17 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
break;
}
+ case OMPRTL__kmpc_barrier_simple_spmd: {
+ // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
+ // global_tid);
+ llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+ RTLFn =
+ CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
+ cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
+ break;
+ }
}
return RTLFn;
}
@@ -2208,8 +2215,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
CGM.getContext().getSizeType(), Loc);
llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
KernelStaticGlobalized, CGM.VoidPtrPtrTy);
- llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld,
- IsInSharedMemory, ResAddr};
+ llvm::Value *GlobalRecordSizeArg[] = {
+ llvm::ConstantInt::get(
+ CGM.Int16Ty,
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
+ StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_get_team_static_memory),
GlobalRecordSizeArg);
@@ -2397,10 +2407,15 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
Address(GlobalizedRecords.back().UseSharedMemory,
CGM.getContext().getTypeAlignInChars(Int16Ty)),
/*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
+ llvm::Value *Args[] = {
+ llvm::ConstantInt::get(
+ CGM.Int16Ty,
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
+ IsInSharedMemory};
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_restore_team_static_memory),
- IsInSharedMemory);
+ Args);
}
} else {
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
@@ -2665,6 +2680,20 @@ void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
}
}
+void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
+ // Always emit simple barriers!
+ if (!CGF.HaveInsertPoint())
+ return;
+ // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
+ // This function does not use parameters, so we can emit just default values.
+ llvm::Value *Args[] = {
+ llvm::ConstantPointerNull::get(
+ cast<llvm::PointerType>(getIdentTyPointerTy())),
+ llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
+ CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
+}
+
void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
SourceLocation Loc,
OpenMPDirectiveKind Kind, bool,
@@ -3089,12 +3118,12 @@ static void emitReductionListCopy(
/// void inter_warp_copy_func(void* reduce_data, num_warps)
/// shared smem[warp_size];
/// For all data entries D in reduce_data:
+/// sync
/// If (I am the first lane in each warp)
/// Copy my local D to smem[warp_id]
/// sync
/// if (I am the first warp)
/// Copy smem[thread_id] to my local D
-/// sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy,
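The updated pseudocode above, with the barrier hoisted ahead of the lane-0 copy, corresponds roughly to the following C++ rendering of a single copy step; get_thread_id/get_lane_id/get_warp_id and block_barrier are stand-ins for the NVPTX thread-id intrinsics and the kmpc_barrier call the emitted code now uses, stubbed out here only so the sketch is self-contained:

  #include <cstdint>

  static constexpr uint32_t WarpSize = 32;
  static uint32_t TransferMedium[WarpSize]; // __shared__ on the real device

  static uint32_t get_thread_id() { return 0; } // placeholder for tid.x
  static uint32_t get_lane_id() { return get_thread_id() % WarpSize; }
  static uint32_t get_warp_id() { return get_thread_id() / WarpSize; }
  static void block_barrier() {} // stands in for the emitted kmpc_barrier call

  // Copies one 32-bit chunk D of reduce_data across warps; the emitted
  // function loops this over every chunk of every reduction variable.
  void inter_warp_copy_step(uint32_t *D, int32_t NumWarps) {
    block_barrier();                          // sync (now placed before the store)
    if (get_lane_id() == 0)                   // I am the first lane in my warp
      TransferMedium[get_warp_id()] = *D;     // copy my local D to smem[warp_id]
    block_barrier();                          // sync: staged values are visible
    if (get_thread_id() < (uint32_t)NumWarps) // I am in the first warp
      *D = TransferMedium[get_thread_id()];   // copy smem[thread_id] to my local D
  }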
@@ -3203,6 +3232,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
CGF.EmitBlock(BodyBB);
}
+ // kmpc_barrier.
+ CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
+ /*EmitChecks=*/false,
+ /*ForceSimpleCall=*/true);
llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
@@ -3303,12 +3336,6 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
CGF.EmitBlock(W0MergeBB);
- // While warp 0 copies values from transfer medium, all other warps must
- // wait.
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
if (NumIters > 1) {
Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
@@ -3586,7 +3613,7 @@ static llvm::Value *emitShuffleAndReduceFunction(
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
/// and store the result on the team master:
///
-/// __kmpc_nvptx_parallel_reduce_nowait(...,
+/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
/// reduceData, shuffleReduceFn, interWarpCpyFn)
///
/// where:
@@ -3757,7 +3784,7 @@ static llvm::Value *emitShuffleAndReduceFunction(
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
-/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
+/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
@@ -3780,7 +3807,7 @@ static llvm::Value *emitShuffleAndReduceFunction(
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
-/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
+/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
/// reduce across workers and compute a globally reduced value.
///
void CGOpenMPRuntimeNVPTX::emitReduction(
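Putting the pieces together, the call that emitReduction() emits for a parallel reduction after this patch has roughly the following source-level shape; the argument names are illustrative, and the Res == 1 check mirrors the reduction epilogue the existing codegen already emits before it finishes via the OMPRTL_NVPTX__kmpc_end_reduce_nowait entry point handled above:

  #include <cstddef>
  #include <cstdint>

  struct ident_t;
  typedef void (*ShuffleReduceFnTy)(void *, int16_t, int16_t, int16_t);
  typedef void (*InterWarpCopyFnTy)(void *, int32_t);

  extern "C" int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
      ident_t *, int32_t, int32_t, size_t, void *, ShuffleReduceFnTy,
      InterWarpCopyFnTy);

  void parallel_reduction_shape(ident_t *Loc, int32_t ThreadId, int32_t NumVars,
                                size_t ReduceSize, void *ReduceData,
                                ShuffleReduceFnTy ShuffleReduceFn,
                                InterWarpCopyFnTy InterWarpCopyFn) {
    int32_t Res = __kmpc_nvptx_parallel_reduce_nowait_v2(
        Loc, ThreadId, NumVars, ReduceSize, ReduceData, ShuffleReduceFn,
        InterWarpCopyFn);
    if (Res == 1) {
      // The winning thread copies the team-reduced values back into the
      // original list items and then finishes the reduction via the matching
      // end-reduce runtime call.
    }
  }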
@@ -3810,6 +3837,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
// RedList, shuffle_reduce_func, interwarp_copy_func);
// or
// Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *ThreadId = getThreadID(CGF, Loc);
llvm::Value *Res;
@@ -3864,19 +3892,19 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
llvm::Value *InterWarpCopyFn =
emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
- llvm::Value *Args[] = {ThreadId,
+ llvm::Value *Args[] = {RTLoc,
+ ThreadId,
CGF.Builder.getInt32(RHSExprs.size()),
ReductionArrayTySize,
RL,
ShuffleAndReduceFn,
InterWarpCopyFn};
- Res = CGF.EmitRuntimeCall(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
- Args);
+ Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
+ OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2),
+ Args);
} else {
assert(TeamsReduction && "expected teams reduction.");
- llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
std::string Name = getName({"reduction"});
llvm::Value *Lock = getCriticalRegionLock(Name);
llvm::Value *Args[] = {RTLoc, ThreadId, Lock};
@@ -4513,6 +4541,22 @@ void CGOpenMPRuntimeNVPTX::clear() {
Records.UseSharedMemory->setInitializer(
llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
}
+ // Allocate SharedMemorySize buffer for the shared memory.
+ // FIXME: nvlink does not handle weak linkage correctly (objects with
+ // different sizes are reported as erroneous).
+ // Restore this code as soon as nvlink is fixed.
+ if (!SharedStaticRD->field_empty()) {
+ llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
+ QualType SubTy = C.getConstantArrayType(
+ C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
+ auto *Field = FieldDecl::Create(
+ C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
+ C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
+ /*BW=*/nullptr, /*Mutable=*/false,
+ /*InitStyle=*/ICIS_NoInit);
+ Field->setAccess(AS_public);
+ SharedStaticRD->addDecl(Field);
+ }
SharedStaticRD->completeDefinition();
if (!SharedStaticRD->field_empty()) {
QualType StaticTy = C.getRecordType(SharedStaticRD);