author     Justin Lebar <jlebar@google.com>  2016-01-14 21:41:27 +0000
committer  Justin Lebar <jlebar@google.com>  2016-01-14 21:41:27 +0000
commit     5001ed56f234095e3f01ac1f2057135efebda3b7 (patch)
tree       c3d17b2ff4c80ad8a12dd3d1f55ed87f770be137 /lib
parent     9c0291cdff2e9a819b7ecf0ffdfd63cd29d0a2c6 (diff)
[CUDA] Invoke ptxas and fatbinary during compilation.
Summary: Previously we compiled CUDA device code to PTX assembly and embedded that asm as text in our host binary. Now we compile to PTX assembly and then invoke ptxas to assemble the PTX into a cubin file. We gather the ptx and cubin files for each of our --cuda-gpu-archs and combine them using fatbinary, and then embed that into the host binary.

Adds two new command-line flags, -Xcuda_ptxas and -Xcuda_fatbinary, which pass args down to the external tools.

Reviewers: tra, echristo

Subscribers: cfe-commits, jhen

Differential Revision: http://reviews.llvm.org/D16082

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@257809 91177308-0d34-0410-b5e6-96231b3b80d8
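For illustration only, here is a rough sketch of what the device side of a single-arch compilation lowers to after this change. The source file, temporary names, sm_35 arch, and 64-bit target are hypothetical (the driver picks its own temporaries), and any values given via -Xcuda_ptxas / -Xcuda_fatbinary are appended to the respective tool's command line; the argument lists below follow the ones constructed in NVPTX::Assembler and NVPTX::Linker in this patch, not verbatim driver output:

    clang++ -c axpy.cu --cuda-gpu-arch=sm_35 -o axpy.o

    ptxas -m64 -O0 --gpu-name sm_35 --output-file axpy-sm_35.cubin axpy-sm_35.s
    fatbinary --cuda -64 --create axpy.fatbin \
        --image=profile=sm_35,file=axpy-sm_35.cubin \
        --image=profile=compute_35,file=axpy-sm_35.s

The resulting fatbin is then embedded into the host object (in the .nvFatBinSegment section, so NVIDIA's cuobjdump can find it).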
Diffstat (limited to 'lib')
-rw-r--r--  lib/CodeGen/CGCUDANV.cpp  |  2
-rw-r--r--  lib/Driver/Action.cpp     |  2
-rw-r--r--  lib/Driver/Driver.cpp     | 52
-rw-r--r--  lib/Driver/ToolChains.cpp | 27
-rw-r--r--  lib/Driver/ToolChains.h   | 11
-rw-r--r--  lib/Driver/Tools.cpp      | 78
-rw-r--r--  lib/Driver/Tools.h        | 35
-rw-r--r--  lib/Driver/Types.cpp      |  3
8 files changed, 184 insertions, 26 deletions
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 045e19b189..9dd7928bcf 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -259,6 +259,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
llvm::ConstantStruct::get(FatbinWrapperTy, Values),
"__cuda_fatbin_wrapper");
+ // NVIDIA's cuobjdump looks for fatbins in this section.
+ FatbinWrapper->setSection(".nvFatBinSegment");
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
diff --git a/lib/Driver/Action.cpp b/lib/Driver/Action.cpp
index e9490e96db..b45f290efc 100644
--- a/lib/Driver/Action.cpp
+++ b/lib/Driver/Action.cpp
@@ -75,7 +75,7 @@ CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName,
bool AtTopLevel)
: Action(CudaDeviceClass, Input), GpuArchName(ArchName),
AtTopLevel(AtTopLevel) {
- assert(IsValidGpuArchName(GpuArchName));
+ assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
}
const char *CudaDeviceAction::getComputeArchName() const {
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
index e9a37c6aa8..0cdfb4fe10 100644
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -949,8 +949,9 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
os << '"' << BIA->getArchName() << '"' << ", {"
<< PrintActions1(C, *BIA->begin(), Ids) << "}";
} else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
- os << '"' << CDA->getGpuArchName() << '"' << ", {"
- << PrintActions1(C, *CDA->begin(), Ids) << "}";
+ os << '"'
+ << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+ << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
} else {
const ActionList *AL;
if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
@@ -1327,7 +1328,7 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
// Check whether any of device actions stopped before they could generate PTX.
bool PartialCompilation =
llvm::any_of(CudaDeviceActions, [](const Action *a) {
- return a->getKind() != Action::BackendJobClass;
+ return a->getKind() != Action::AssembleJobClass;
});
// Figure out what to do with device actions -- pass them as inputs to the
@@ -1356,16 +1357,32 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
return HostAction;
}
- // Outputs of device actions during complete CUDA compilation get created
- // with AtTopLevel=false and become inputs for the host action.
+ // If we're not a partial or device-only compilation, we compile each arch to
+ // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host compilation.
ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- DeviceActions.push_back(
- C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
- /* AtTopLevel */ false));
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ Action* AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action* BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (const auto& A : {AssembleAction, BackendAction}) {
+ DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+ A, GpuArchList[I], /* AtTopLevel */ false));
+ }
+ }
+ auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+ /* GpuArchName = */ nullptr,
+ /* AtTopLevel = */ false);
// Return a new host action that incorporates original host action and all
// device actions.
- return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+ return C.MakeAction<CudaHostAction>(std::move(HostAction),
+ ActionList({FatbinAction}));
}
void Driver::BuildActions(Compilation &C, const ToolChain &TC,
@@ -1600,7 +1617,7 @@ Action *Driver::ConstructPhaseAction(Compilation &C, const ToolChain &TC,
return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
}
case phases::Assemble:
- return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+ return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object);
}
llvm_unreachable("invalid phase in ConstructPhaseAction");
@@ -1849,11 +1866,14 @@ InputInfo Driver::BuildJobsForActionNoCache(
if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
// Initial processing of CudaDeviceAction carries host params.
// Call BuildJobsForAction() again, now with correct device parameters.
- assert(CDA->getGpuArchName() && "No GPU name in device action.");
- return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
- CDA->getGpuArchName(), CDA->isAtTopLevel(),
- /*MultipleArchs*/ true, LinkingOutput,
- CachedResults);
+ InputInfo II = BuildJobsForAction(
+ C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+ CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+ CachedResults);
+ // Currently II's Action is *CDA->begin(). Set it to CDA instead, so that
+ // one can retrieve II's GPU arch.
+ II.setAction(A);
+ return II;
}
const ActionList *Inputs = &A->getInputs();
diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp
index e3f25f1e32..15b3677822 100644
--- a/lib/Driver/ToolChains.cpp
+++ b/lib/Driver/ToolChains.cpp
@@ -1652,13 +1652,14 @@ void Generic_GCC::CudaInstallationDetector::init(
continue;
CudaInstallPath = CudaPath;
+ CudaBinPath = CudaPath + "/bin";
CudaIncludePath = CudaInstallPath + "/include";
CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
CudaLibPath =
CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
if (!(D.getVFS().exists(CudaIncludePath) &&
- D.getVFS().exists(CudaLibPath) &&
+ D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
D.getVFS().exists(CudaLibDevicePath)))
continue;
@@ -4182,13 +4183,16 @@ Tool *DragonFly::buildLinker() const {
return new tools::dragonfly::Linker(*this);
}
-/// Stub for CUDA toolchain. At the moment we don't have assembler or
-/// linker and need toolchain mainly to propagate device-side options
-/// to CC1.
+/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
- : Linux(D, Triple, Args) {}
+ : Linux(D, Triple, Args) {
+ if (CudaInstallation.isValid())
+ getProgramPaths().push_back(CudaInstallation.getBinPath());
+}
void
CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
@@ -4222,7 +4226,7 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
for (Arg *A : Args) {
if (A->getOption().matches(options::OPT_Xarch__)) {
// Skip this argument unless the architecture matches BoundArch
- if (A->getValue(0) != StringRef(BoundArch))
+ if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
continue;
unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
@@ -4253,10 +4257,19 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
DAL->append(A);
}
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+ if (BoundArch)
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
return DAL;
}
+Tool *CudaToolChain::buildAssembler() const {
+ return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *CudaToolChain::buildLinker() const {
+ return new tools::NVPTX::Linker(*this);
+}
+
/// XCore tool chain
XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
diff --git a/lib/Driver/ToolChains.h b/lib/Driver/ToolChains.h
index 622c414c62..a5f54733e8 100644
--- a/lib/Driver/ToolChains.h
+++ b/lib/Driver/ToolChains.h
@@ -163,6 +163,7 @@ protected:
bool IsValid;
const Driver &D;
std::string CudaInstallPath;
+ std::string CudaBinPath;
std::string CudaLibPath;
std::string CudaLibDevicePath;
std::string CudaIncludePath;
@@ -179,6 +180,8 @@ protected:
/// \brief Get the detected Cuda installation path.
StringRef getInstallPath() const { return CudaInstallPath; }
+ /// \brief Get the detected path to Cuda's bin directory.
+ StringRef getBinPath() const { return CudaBinPath; }
/// \brief Get the detected Cuda Include path.
StringRef getIncludePath() const { return CudaIncludePath; }
/// \brief Get the detected Cuda library path.
@@ -816,6 +819,14 @@ public:
const char *BoundArch) const override;
void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const override;
+
+ // Never try to use the integrated assembler with CUDA; always fork out to
+ // ptxas.
+ bool useIntegratedAs() const override { return false; }
+
+protected:
+ Tool *buildAssembler() const override; // ptxas
+ Tool *buildLinker() const override; // fatbinary (ok, not really a linker)
};
class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux {
diff --git a/lib/Driver/Tools.cpp b/lib/Driver/Tools.cpp
index 5e4777b466..e498f98355 100644
--- a/lib/Driver/Tools.cpp
+++ b/lib/Driver/Tools.cpp
@@ -10625,3 +10625,81 @@ void PS4cpu::Link::ConstructJob(Compilation &C, const JobAction &JA,
else
ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
}
+
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const ArgList &Args,
+ const char *LinkingOutput) const {
+ const auto &TC =
+ static_cast<const toolchains::CudaToolChain &>(getToolChain());
+ assert(TC.getArch() == llvm::Triple::nvptx ||
+ TC.getArch() == llvm::Triple::nvptx64);
+
+ std::vector<std::string> gpu_archs =
+ Args.getAllArgValues(options::OPT_march_EQ);
+ assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+ const std::string& gpu_arch = gpu_archs[0];
+
+
+ ArgStringList CmdArgs;
+ CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+
+ // Clang's default optimization level is -O0, but ptxas's default is -O3.
+ CmdArgs.push_back(Args.MakeArgString(
+ llvm::Twine("-O") +
+ Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+ // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+ // not supported at other optimization levels.
+
+ CmdArgs.push_back("--gpu-name");
+ CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+ CmdArgs.push_back("--output-file");
+ CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+ for (const auto& II : Inputs)
+ CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+
+ for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+ CmdArgs.push_back(Args.MakeArgString(A));
+
+ const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+ C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output,
+ const InputInfoList &Inputs,
+ const ArgList &Args,
+ const char *LinkingOutput) const {
+ const auto &TC =
+ static_cast<const toolchains::CudaToolChain &>(getToolChain());
+ assert(TC.getArch() == llvm::Triple::nvptx ||
+ TC.getArch() == llvm::Triple::nvptx64);
+
+ ArgStringList CmdArgs;
+ CmdArgs.push_back("--cuda");
+ CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+ CmdArgs.push_back(Args.MakeArgString("--create"));
+ CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+ for (const auto& II : Inputs) {
+ auto* A = cast<const CudaDeviceAction>(II.getAction());
+ // We need to pass an Arch of the form "sm_XX" for cubin files and
+ // "compute_XX" for ptx.
+ const char *Arch = (II.getType() == types::TY_PP_Asm)
+ ? A->getComputeArchName()
+ : A->getGpuArchName();
+ CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+ Arch + ",file=" + II.getFilename()));
+ }
+
+ for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+ CmdArgs.push_back(Args.MakeArgString(A));
+
+ const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+ C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
diff --git a/lib/Driver/Tools.h b/lib/Driver/Tools.h
index 2b137f4a6d..69c046587f 100644
--- a/lib/Driver/Tools.h
+++ b/lib/Driver/Tools.h
@@ -903,6 +903,41 @@ public:
};
} // end namespace PS4cpu
+namespace NVPTX {
+
+// Run ptxas, the NVPTX assembler.
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+ public:
+ Assembler(const ToolChain &TC)
+ : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8,
+ "--options-file") {}
+
+ bool hasIntegratedCPP() const override { return false; }
+
+ void ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &TCArgs,
+ const char *LinkingOutput) const override;
+};
+
+// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
+// assembly into a single output file.
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+ public:
+ Linker(const ToolChain &TC)
+ : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+ "--options-file") {}
+
+ bool hasIntegratedCPP() const override { return false; }
+
+ void ConstructJob(Compilation &C, const JobAction &JA,
+ const InputInfo &Output, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &TCArgs,
+ const char *LinkingOutput) const override;
+};
+
+} // end namespace NVPTX
+
} // end namespace tools
} // end namespace driver
} // end namespace clang
diff --git a/lib/Driver/Types.cpp b/lib/Driver/Types.cpp
index c29ce9462a..3b3b67fc5d 100644
--- a/lib/Driver/Types.cpp
+++ b/lib/Driver/Types.cpp
@@ -232,8 +232,7 @@ void types::getCompilationPhases(ID Id, llvm::SmallVectorImpl<phases::ID> &P) {
P.push_back(phases::Compile);
P.push_back(phases::Backend);
}
- if (Id != TY_CUDA_DEVICE)
- P.push_back(phases::Assemble);
+ P.push_back(phases::Assemble);
}
}