[OpenMP] Basic support for a parallel directive in a target region on an NVPTX device

Summary: This patch introduces support for the execution of parallel constructs in a target region on the NVPTX device. Parallel regions must be in the lexical scope of the target directive. The master thread in the master warp signals parallel work for worker threads in worker warps on encountering a parallel region. Note: The patch does not yet support capture of arguments in a parallel region so the test cases are simple. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28145 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@291565 91177308-0d34-0410-b5e6-96231b3b80d8
author: Arpith Chacko Jacob <acjacob@us.ibm.com> 2017-01-10 15:42:51 +0000
committer: Arpith Chacko Jacob <acjacob@us.ibm.com> 2017-01-10 15:42:51 +0000
commit: f6a92cca29985a47b5d4da98c59d0238f75d8a3e (patch)
tree: 2355343a998205b73e45126b6f2d54f46670e824
parent: a41b64721d4be78870711cbb7b5af156a9e1f4ad (diff)
5 files changed, 611 insertions, 27 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 27af344fae..db9de2ab6a 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -99,10 +99,11 @@ class CGOpenMPOutlinedRegionInfo final : public CGOpenMPRegionInfo {
 public:
   CGOpenMPOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar,
                              const RegionCodeGenTy &CodeGen,
-                             OpenMPDirectiveKind Kind, bool HasCancel)
+                             OpenMPDirectiveKind Kind, bool HasCancel,
+                             StringRef HelperName)
       : CGOpenMPRegionInfo(CS, ParallelOutlinedRegion, CodeGen, Kind,
                            HasCancel),
-        ThreadIDVar(ThreadIDVar) {
+        ThreadIDVar(ThreadIDVar), HelperName(HelperName) {
     assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
   }
 
@@ -111,7 +112,7 @@ public:
   const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
 
   /// \brief Get the name of the capture helper.
-  StringRef getHelperName() const override { return ".omp_outlined."; }
+  StringRef getHelperName() const override { return HelperName; }
 
   static bool classof(const CGCapturedStmtInfo *Info) {
     return CGOpenMPRegionInfo::classof(Info) &&
@@ -123,6 +124,7 @@ private:
   /// \brief A variable or parameter storing global thread id for OpenMP
   /// constructs.
   const VarDecl *ThreadIDVar;
+  StringRef HelperName;
 };
 
 /// \brief API for captured statement code generation in OpenMP constructs.
@@ -855,7 +857,7 @@ llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
   else if (auto *OPFD = dyn_cast<OMPParallelForDirective>(&D))
     HasCancel = OPFD->hasCancel();
   CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
-                                    HasCancel);
+                                    HasCancel, getOutlinedHelperName());
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
   return CGF.GenerateOpenMPCapturedStmtFunction(*CS);
 }
@@ -1892,9 +1894,9 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
 /// } else {
 ///   ElseGen();
 /// }
-static void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
-                            const RegionCodeGenTy &ThenGen,
-                            const RegionCodeGenTy &ElseGen) {
+void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
+                                      const RegionCodeGenTy &ThenGen,
+                                      const RegionCodeGenTy &ElseGen) {
   CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange());
 
   // If the condition constant folds and can be elided, try to avoid emitting
diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h
index 9a784dff0a..61ddc702ed 100644
--- a/lib/CodeGen/CGOpenMPRuntime.h
+++ b/lib/CodeGen/CGOpenMPRuntime.h
@@ -130,6 +130,35 @@ protected:
                                                 bool IsOffloadEntry,
                                                 const RegionCodeGenTy &CodeGen);
 
+  /// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
+  /// function. Here is the logic:
+  /// if (Cond) {
+  ///   ThenGen();
+  /// } else {
+  ///   ElseGen();
+  /// }
+  void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
+                       const RegionCodeGenTy &ThenGen,
+                       const RegionCodeGenTy &ElseGen);
+
+  /// \brief Emits object of ident_t type with info for source location.
+  /// \param Flags Flags for OpenMP location.
+  ///
+  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
+                                  unsigned Flags = 0);
+
+  /// \brief Returns pointer to ident_t type.
+  llvm::Type *getIdentTyPointerTy();
+
+  /// \brief Gets thread id value for the current thread.
+  ///
+  llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
+
+  /// \brief Get the function name of an outlined region.
+  //  The name can be customized depending on the target.
+  //
+  virtual StringRef getOutlinedHelperName() const { return ".omp_outlined."; }
+
 private:
   /// \brief Default const ident_t object used for initialization of all other
   /// ident_t objects.
@@ -388,15 +417,6 @@ private:
   /// \brief Build type kmp_routine_entry_t (if not built yet).
   void emitKmpRoutineEntryT(QualType KmpInt32Ty);
 
-  /// \brief Emits object of ident_t type with info for source location.
-  /// \param Flags Flags for OpenMP location.
-  ///
-  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
-                                  unsigned Flags = 0);
-
-  /// \brief Returns pointer to ident_t type.
-  llvm::Type *getIdentTyPointerTy();
-
   /// \brief Returns pointer to kmpc_micro type.
   llvm::Type *getKmpc_MicroPointerTy();
 
@@ -432,10 +452,6 @@ private:
   /// stored.
   virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc);
 
-  /// \brief Gets thread id value for the current thread.
-  ///
-  llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
-
   /// \brief Gets (if variable with the given name already exist) or creates
   /// internal global variable with the specified Name. The created variable has
   /// linkage CommonLinkage by default and is initialized by null value.
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index bc1458b1c2..6a6d832e33 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -26,8 +26,57 @@ enum OpenMPRTLFunctionNVPTX {
   OMPRTL_NVPTX__kmpc_kernel_init,
   /// \brief Call to void __kmpc_kernel_deinit();
   OMPRTL_NVPTX__kmpc_kernel_deinit,
+  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
+  /// *outlined_function);
+  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
+  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
+  OMPRTL_NVPTX__kmpc_kernel_parallel,
+  /// \brief Call to void __kmpc_kernel_end_parallel();
+  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
+  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL_NVPTX__kmpc_serialized_parallel,
+  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
 };
-} // namespace
+
+/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
+class NVPTXActionTy final : public PrePostActionTy {
+  llvm::Value *EnterCallee;
+  ArrayRef<llvm::Value *> EnterArgs;
+  llvm::Value *ExitCallee;
+  ArrayRef<llvm::Value *> ExitArgs;
+  bool Conditional;
+  llvm::BasicBlock *ContBlock = nullptr;
+
+public:
+  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
+                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
+                bool Conditional = false)
+      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
+        ExitArgs(ExitArgs), Conditional(Conditional) {}
+  void Enter(CodeGenFunction &CGF) override {
+    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
+    if (Conditional) {
+      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
+      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
+      ContBlock = CGF.createBasicBlock("omp_if.end");
+      // Generate the branch (If-stmt)
+      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
+      CGF.EmitBlock(ThenBlock);
+    }
+  }
+  void Done(CodeGenFunction &CGF) {
+    // Emit the rest of blocks/branches
+    CGF.EmitBranch(ContBlock);
+    CGF.EmitBlock(ContBlock, true);
+  }
+  void Exit(CodeGenFunction &CGF) override {
+    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
+  }
+};
+} // anonymous namespace
 
 /// Get the GPU warp size.
 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
@@ -118,6 +167,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                              const RegionCodeGenTy &CodeGen) {
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM);
+  Work.clear();
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
@@ -246,7 +296,10 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
   CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
   CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
 
-  // TODO: Call into runtime to get parallel work.
+  llvm::Value *Args[] = {WorkFn.getPointer()};
+  llvm::Value *Ret = CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
+  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
 
   // On termination condition (workid == 0), exit loop.
   llvm::Value *ShouldTerminate =
@@ -261,10 +314,42 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
 
   // Signal start of parallel region.
   CGF.EmitBlock(ExecuteBB);
-  // TODO: Add parallel work.
+
+  // Process work items: outlined parallel functions.
+  for (auto *W : Work) {
+    // Try to match this outlined function.
+    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
+
+    llvm::Value *WorkFnMatch =
+        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
+
+    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
+    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
+    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
+
+    // Execute this outlined function.
+    CGF.EmitBlock(ExecuteFNBB);
+
+    // Insert call to work function.
+    // FIXME: Pass arguments to outlined function from master thread.
+    auto *Fn = cast<llvm::Function>(W);
+    Address ZeroAddr =
+        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
+    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
+    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
+    CGF.EmitCallOrInvoke(Fn, FnArgs);
+
+    // Go to end of parallel region.
+    CGF.EmitBranch(TerminateBB);
+
+    CGF.EmitBlock(CheckNextBB);
+  }
 
   // Signal end of parallel region.
   CGF.EmitBlock(TerminateBB);
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
+      llvm::None);
   CGF.EmitBranch(BarrierBB);
 
   // All active and inactive workers wait at a barrier after parallel region.
@@ -296,10 +381,53 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
   case OMPRTL_NVPTX__kmpc_kernel_deinit: {
     // Build void __kmpc_kernel_deinit();
     llvm::FunctionType *FnTy =
-        llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false);
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
+    /// Build void __kmpc_kernel_prepare_parallel(
+    /// void *outlined_function);
+    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
+    /// Build bool __kmpc_kernel_parallel(void **outlined_function);
+    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
+    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
+    /// Build void __kmpc_kernel_end_parallel();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
+    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
+    // global_tid);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
+    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
+    // global_tid);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
+    break;
+  }
   }
   return RTLFn;
 }
@@ -362,9 +490,12 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
     OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
     OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
     OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
-  } else
-    llvm_unreachable("parallel directive is not yet supported for nvptx "
-                     "backend.");
+  } else {
+    llvm::Value *OutlinedFunVal =
+        CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
+            D, ThreadIDVar, InnermostKind, CodeGen);
+    OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
+  }
 
   return OutlinedFun;
 }
@@ -387,3 +518,81 @@ void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
   CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
 }
+
+void CGOpenMPRuntimeNVPTX::emitParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+}
+
+void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
+
+  auto &&L0ParallelGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
+                                                   PrePostActionTy &) {
+    CGBuilderTy &Bld = CGF.Builder;
+
+    // Prepare for parallel region. Indicate the outlined function.
+    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+        Args);
+
+    // Activate workers. This barrier is used by the master to signal
+    // work for the workers.
+    syncCTAThreads(CGF);
+
+    // OpenMP [2.5, Parallel Construct, p.49]
+    // There is an implied barrier at the end of a parallel region. After the
+    // end of a parallel region, only the master thread of the team resumes
+    // execution of the enclosing task region.
+    //
+    // The master waits at this barrier until all workers are done.
+    syncCTAThreads(CGF);
+
+    // Remember for post-processing in worker loop.
+    Work.push_back(Fn);
+  };
+
+  auto *RTLoc = emitUpdateLocation(CGF, Loc);
+  auto *ThreadID = getThreadID(CGF, Loc);
+  llvm::Value *Args[] = {RTLoc, ThreadID};
+
+  auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
+                                                   PrePostActionTy &) {
+    auto &&CodeGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
+                                                      PrePostActionTy &Action) {
+      Action.Enter(CGF);
+
+      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+      OutlinedFnArgs.push_back(
+          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+      OutlinedFnArgs.push_back(
+          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+      CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
+    };
+
+    RegionCodeGenTy RCG(CodeGen);
+    NVPTXActionTy Action(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+        Args,
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+        Args);
+    RCG.setAction(Action);
+    RCG(CGF);
+  };
+
+  if (IfCond)
+    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
+  else {
+    CodeGenFunction::RunCleanupsScope Scope(CGF);
+    RegionCodeGenTy ThenRCG(L0ParallelGen);
+    ThenRCG(CGF);
+  }
+}
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 63a02965a5..4010b46a4c 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -25,6 +25,9 @@ namespace CodeGen {
 
 class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
 private:
+  // Parallel outlined function work for workers to execute.
+  llvm::SmallVector<llvm::Function *, 16> Work;
+
   struct EntryFunctionState {
     llvm::BasicBlock *ExitBB = nullptr;
   };
@@ -100,6 +103,29 @@ private:
                                   bool IsOffloadEntry,
                                   const RegionCodeGenTy &CodeGen) override;
 
+  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record which address is stored in \a
+  /// CapturedStruct.
+  /// This call is for the Generic Execution Mode.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                               llvm::Value *OutlinedFn,
+                               ArrayRef<llvm::Value *> CapturedVars,
+                               const Expr *IfCond);
+
+protected:
+  /// \brief Get the function name of an outlined region.
+  //  The name can be customized depending on the target.
+  //
+  StringRef getOutlinedHelperName() const override {
+    return "__omp_outlined__";
+  }
+
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
 
@@ -137,6 +163,20 @@ public:
   void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
                      SourceLocation Loc, llvm::Value *OutlinedFn,
                      ArrayRef<llvm::Value *> CapturedVars) override;
+
+  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record which address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        llvm::Value *OutlinedFn,
+                        ArrayRef<llvm::Value *> CapturedVars,
+                        const Expr *IfCond) override;
 };
 
 } // CodeGen namespace.
diff --git a/test/OpenMP/nvptx_parallel_codegen.cpp b/test/OpenMP/nvptx_parallel_codegen.cpp
new file mode 100644
index 0000000000..224f245696
--- /dev/null
+++ b/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -0,0 +1,317 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target if(0)
+  {
+    #pragma omp parallel
+    {
+      int a = 41;
+    }
+    a += 1;
+  }
+
+  #pragma omp target
+  {
+    #pragma omp parallel
+    {
+      int a = 42;
+    }
+    #pragma omp parallel if(0)
+    {
+      int a = 43;
+    }
+    #pragma omp parallel if(1)
+    {
+      int a = 44;
+    }
+    a += 1;
+  }
+
+  #pragma omp target if(n>40)
+  {
+    #pragma omp parallel if(n>1000)
+    {
+      int a = 45;
+    }
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+  // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+  // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
+  // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
+  //
+  // CHECK: [[EXEC_PFN1]]
+  // CHECK: call void [[PARALLEL_FN1]](
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[CHECK_NEXT1]]
+  // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
+  // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
+  //
+  // CHECK: [[EXEC_PFN2]]
+  // CHECK: call void [[PARALLEL_FN2]](
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[CHECK_NEXT2]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: call void @__kmpc_kernel_end_parallel()
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
+  // Create local storage for each capture.
+  // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]],
+  // CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // Store captures in the context.
+  // CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+  //
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: {{call|invoke}} void [[T6]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*))
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: call void @__kmpc_serialized_parallel(
+  // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
+  // CHECK: call void @__kmpc_end_serialized_parallel(
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*))
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK-64-DAG: load i32, i32* [[REF_A]]
+  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
+  //
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK-DAG: define internal void [[PARALLEL_FN1]](
+  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+  // CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
+  // CHECK: ret void
+
+  // CHECK-DAG: define internal void [[PARALLEL_FN3]](
+  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+  // CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
+  // CHECK: ret void
+
+  // CHECK-DAG: define internal void [[PARALLEL_FN2]](
+  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+  // CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
+  // CHECK: ret void
+
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+  // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+  // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
+  // CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
+  //
+  // CHECK: [[EXEC_PFN]]
+  // CHECK: call void [[PARALLEL_FN4]](
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[CHECK_NEXT]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: call void @__kmpc_kernel_end_parallel()
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
+  // Create local storage for each capture.
+  // CHECK:  [[LOCAL_N:%.+]] = alloca i[[SZ]],
+  // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]],
+  // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]],
+  // CHECK:  [[LOCAL_B:%.+]] = alloca [10 x i32]*
+  // CHECK-DAG:  store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+  // CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+  // Store captures in the context.
+  // CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
+  // CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+  // CHECK-DAG:   [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+  // CHECK-DAG:   [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+  //
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: {{call|invoke}} void [[T6]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+  // CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
+  // CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
+  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
+  // CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
+  //
+  // CHECK: [[IF_THEN]]
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*))
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[IF_END:.+]]
+  //
+  // CHECK: [[IF_ELSE]]
+  // CHECK: call void @__kmpc_serialized_parallel(
+  // CHECK: {{call|invoke}} void [[PARALLEL_FN4]](
+  // CHECK: call void @__kmpc_end_serialized_parallel(
+  // br label [[IF_END]]
+  //
+  // CHECK: [[IF_END]]
+  // CHECK-64-DAG: load i32, i32* [[REF_A]]
+  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:    load i16, i16* [[REF_AA]]
+  // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  //
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
+  //
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define internal void [[PARALLEL_FN4]](
+  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+  // CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
+  // CHECK: ret void
+#endif
author	Arpith Chacko Jacob <acjacob@us.ibm.com>	2017-01-10 15:42:51 +0000
committer	Arpith Chacko Jacob <acjacob@us.ibm.com>	2017-01-10 15:42:51 +0000
commit	f6a92cca29985a47b5d4da98c59d0238f75d8a3e (patch)
tree	2355343a998205b73e45126b6f2d54f46670e824
parent	a41b64721d4be78870711cbb7b5af156a9e1f4ad (diff)