diff options
author | Jordan Rupprecht <rupprecht@google.com> | 2019-01-18 19:46:00 +0000 |
---|---|---|
committer | Jordan Rupprecht <rupprecht@google.com> | 2019-01-18 19:46:00 +0000 |
commit | 3748d41833787fcbf59cc5624e8d2b042a8991bc (patch) | |
tree | f3fcdba7decca7ee845a1bb3f885cb0baa1b4d83 /lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | |
parent | 55c8788102d8fd203270fabd6513247b2d7fbd87 (diff) | |
parent | e054eb577a1f469b1a4a49fce08572c76e2dddf2 (diff) |
Creating branches/google/stable and tags/google/stable/2019-01-18 from r351319
git-svn-id: https://llvm.org/svn/llvm-project/cfe/branches/google/stable@351578 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 146 |
1 files changed, 95 insertions, 51 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index b055132ef0..7046ab3aa3 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -56,12 +56,12 @@ enum OpenMPRTLFunctionNVPTX { /// Call to int64_t __kmpc_shuffle_int64(int64_t element, /// int16_t lane_offset, int16_t warp_size); OMPRTL_NVPTX__kmpc_shuffle_int64, - /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 + /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32 /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t /// lane_offset, int16_t shortCircuit), /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); - OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2, /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32 /// global_tid, kmp_critical_name *lck) OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple, @@ -91,13 +91,17 @@ enum OpenMPRTLFunctionNVPTX { OMPRTL_NVPTX__kmpc_parallel_level, /// Call to int8_t __kmpc_is_spmd_exec_mode(); OMPRTL_NVPTX__kmpc_is_spmd_exec_mode, - /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size, - /// int16_t is_shared, const void **res); + /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + /// const void *buf, size_t size, int16_t is_shared, const void **res); OMPRTL_NVPTX__kmpc_get_team_static_memory, - /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared); + /// Call to void __kmpc_restore_team_static_memory(int16_t + /// isSPMDExecutionMode, int16_t is_shared); OMPRTL_NVPTX__kmpc_restore_team_static_memory, - // Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); + /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); OMPRTL__kmpc_barrier, + /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, 
kmp_int32 + /// global_tid); + OMPRTL__kmpc_barrier_simple_spmd, }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -640,17 +644,6 @@ static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { "nvptx_num_threads"); } -/// Get barrier to synchronize all threads in a block. -static void getNVPTXCTABarrier(CodeGenFunction &CGF) { - llvm::Function *F = llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0); - F->addFnAttr(llvm::Attribute::Convergent); - CGF.EmitRuntimeCall(F); -} - -/// Synchronize all GPU threads in a block. -static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } - /// Get the value of the thread_limit clause in the teams directive. /// For the 'generic' execution mode, the runtime encodes thread_limit in /// the launch parameters, always starting thread_limit+warpSize threads per @@ -712,8 +705,8 @@ getDataSharingMode(CodeGenModule &CGM) { : CGOpenMPRuntimeNVPTX::Generic; } -// Checks if the expression is constant or does not have non-trivial function -// calls. +/// Checks if the expression is constant or does not have non-trivial function +/// calls. static bool isTrivial(ASTContext &Ctx, const Expr * E) { // We can skip constant expressions. // We can skip expressions with trivial calls or simple expressions. 
@@ -1654,12 +1647,12 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); break; } - case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: { - // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid, - // kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t Algorithm Version), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); + case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: { + // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, + // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void* + // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t + // lane_id, int16_t lane_offset, int16_t Algorithm Version), void + // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, CGM.Int16Ty, CGM.Int16Ty}; auto *ShuffleReduceFnTy = @@ -1669,7 +1662,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { auto *InterWarpCopyFnTy = llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, /*isVarArg=*/false); - llvm::Type *TypeParams[] = {CGM.Int32Ty, + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), + CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy, CGM.VoidPtrTy, @@ -1678,7 +1672,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { auto *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); + FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2"); break; } case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { @@ -1787,19 +1781,21 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { break; } case OMPRTL_NVPTX__kmpc_get_team_static_memory: { - // Build void 
__kmpc_get_team_static_memory(const void *buf, size_t size, - // int16_t is_shared, const void **res); - llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty, - CGM.VoidPtrPtrTy}; + // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + // const void *buf, size_t size, int16_t is_shared, const void **res); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy, + CGM.Int16Ty, CGM.VoidPtrPtrTy}; auto *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); break; } case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { - // Build void __kmpc_restore_team_static_memory(int16_t is_shared); + // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + // int16_t is_shared); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory"); break; @@ -1813,6 +1809,17 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent); break; } + case OMPRTL__kmpc_barrier_simple_spmd: { + // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 + // global_tid); + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = + CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd"); + cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent); + break; + } } return RTLFn; } @@ -2208,8 +2215,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGM.getContext().getSizeType(), Loc); llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( KernelStaticGlobalized, 
CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld, - IsInSharedMemory, ResAddr}; + llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get( + CGM.Int16Ty, + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0), + StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_get_team_static_memory), GlobalRecordSizeArg); @@ -2397,10 +2407,15 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, Address(GlobalizedRecords.back().UseSharedMemory, CGM.getContext().getTypeAlignInChars(Int16Ty)), /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); + llvm::Value *Args[] = { + llvm::ConstantInt::get( + CGM.Int16Ty, + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0), + IsInSharedMemory}; CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_restore_team_static_memory), - IsInSharedMemory); + Args); } } else { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( @@ -2665,6 +2680,20 @@ void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall( } } +void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { + // Always emit simple barriers! + if (!CGF.HaveInsertPoint()) + return; + // Build call __kmpc_barrier_simple_spmd(nullptr, 0); + // This function does not use parameters, so we can emit just default values. 
+ llvm::Value *Args[] = { + llvm::ConstantPointerNull::get( + cast<llvm::PointerType>(getIdentTyPointerTy())), + llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args); +} + void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool, @@ -3089,12 +3118,12 @@ static void emitReductionListCopy( /// void inter_warp_copy_func(void* reduce_data, num_warps) /// shared smem[warp_size]; /// For all data entries D in reduce_data: +/// sync /// If (I am the first lane in each warp) /// Copy my local D to smem[warp_id] /// sync /// if (I am the first warp) /// Copy smem[thread_id] to my local D -/// sync static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef<const Expr *> Privates, QualType ReductionArrayTy, @@ -3203,6 +3232,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, Bld.CreateCondBr(Cmp, BodyBB, ExitBB); CGF.EmitBlock(BodyBB); } + // kmpc_barrier. + CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, + /*EmitChecks=*/false, + /*ForceSimpleCall=*/true); llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); @@ -3303,12 +3336,6 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, CGF.EmitBlock(W0MergeBB); - // While warp 0 copies values from transfer medium, all other warps must - // wait. - // kmpc_barrier. - CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, - /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); if (NumIters > 1) { Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); @@ -3586,7 +3613,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// 3. 
Call the OpenMP runtime on the GPU to reduce within a team /// and store the result on the team master: /// -/// __kmpc_nvptx_parallel_reduce_nowait(..., +/// __kmpc_nvptx_parallel_reduce_nowait_v2(..., /// reduceData, shuffleReduceFn, interWarpCpyFn) /// /// where: @@ -3757,7 +3784,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// Intra-Team Reduction /// /// This function, as implemented in the runtime call -/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP +/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP /// threads in a team. It first reduces within a warp using the /// aforementioned algorithms. We then proceed to gather all such /// reduced values at the first warp. @@ -3780,7 +3807,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., /// the k'th worker reduces every k'th element. /// -/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to +/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to /// reduce across workers and compute a globally reduced value. 
/// void CGOpenMPRuntimeNVPTX::emitReduction( @@ -3810,6 +3837,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction( // RedList, shuffle_reduce_func, interwarp_copy_func); // or // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); llvm::Value *ThreadId = getThreadID(CGF, Loc); llvm::Value *Res; @@ -3864,19 +3892,19 @@ void CGOpenMPRuntimeNVPTX::emitReduction( llvm::Value *InterWarpCopyFn = emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - llvm::Value *Args[] = {ThreadId, + llvm::Value *Args[] = {RTLoc, + ThreadId, CGF.Builder.getInt32(RHSExprs.size()), ReductionArrayTySize, RL, ShuffleAndReduceFn, InterWarpCopyFn}; - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), - Args); + Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2), + Args); } else { assert(TeamsReduction && "expected teams reduction."); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); std::string Name = getName({"reduction"}); llvm::Value *Lock = getCriticalRegionLock(Name); llvm::Value *Args[] = {RTLoc, ThreadId, Lock}; @@ -4513,6 +4541,22 @@ void CGOpenMPRuntimeNVPTX::clear() { Records.UseSharedMemory->setInitializer( llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); } + // Allocate SharedMemorySize buffer for the shared memory. + // FIXME: nvlink does not handle weak linkage correctly (objects with + // different sizes are reported as erroneous). + // Restore this code as soon as nvlink is fixed. 
+ if (!SharedStaticRD->field_empty()) { + llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); + QualType SubTy = C.getConstantArrayType( + C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0); + auto *Field = FieldDecl::Create( + C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, + C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + SharedStaticRD->addDecl(Field); + } SharedStaticRD->completeDefinition(); if (!SharedStaticRD->field_empty()) { QualType StaticTy = C.getRecordType(SharedStaticRD); |